In [None]:
import json
from groq import Groq
import httpx
import os

from dotenv import load_dotenv
# Runs python-dotenv's loader at import time; presumably this pulls variables
# from a local .env file into the process environment so GROQ_API_KEY can be
# read below — confirm against the deployment setup.
load_dotenv()

# Initialize Groq client
# The API key is read from the GROQ_API_KEY environment variable (os.getenv
# yields None when the variable is unset) and an explicitly constructed
# httpx.Client is handed over as the HTTP client. This module-level `client`
# is the object used by transcribe_audio, process_transcript and
# add_tree_nodes for every Groq request.
client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
    http_client=httpx.Client()
)

def transcribe_audio(audio_path, brand_name):
    """Turn an audio file into text via the Groq Whisper endpoint.

    The file is read in binary mode and submitted through
    ``client.audio.translations.create`` with the ``whisper-large-v3`` model
    and a ``verbose_json`` response format; ``brand_name`` is forwarded as the
    ``prompt`` argument.

    Parameters:
    - audio_path: path of the audio file to read (also sent as the file name)
    - brand_name: brand the call belongs to; used for logging and as prompt

    Returns:
    - The ``text`` attribute of the API response.

    Raises:
    - Whatever exception the file read or the API call raised; it is logged
      to stdout first and then re-raised unchanged.
    """
    print(f"Transcribing audio file: {audio_path} for brand: {brand_name}")
    try:
        with open(audio_path, "rb") as audio_file:
            payload = (audio_path, audio_file.read())
            response = client.audio.translations.create(
                file=payload,
                model="whisper-large-v3",
                response_format="verbose_json",
                prompt=brand_name
            )
        text_length = len(response.text)
        print(f"Successfully transcribed audio. Text length: {text_length}")
        return response.text
    except Exception as err:
        print(f"Error transcribing audio: {str(err)}")
        raise


def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only the previous and the current row of the dynamic-programming table
    are kept, since each cell depends on nothing older than that.
    """
    rows, cols = len(s1), len(s2)

    # Row 0: turning the empty prefix of s1 into s2[:j] takes j insertions.
    previous = list(range(cols + 1))

    for i in range(1, rows + 1):
        # Column 0: turning s1[:i] into the empty string takes i deletions.
        current = [i] + [0] * cols
        left_char = s1[i - 1]
        for j in range(1, cols + 1):
            if left_char == s2[j - 1]:
                # Matching characters cost nothing extra.
                current[j] = previous[j - 1]
            else:
                # Cheapest of deletion, insertion and substitution, plus one.
                current[j] = 1 + min(previous[j], current[j - 1], previous[j - 1])
        previous = current

    return previous[cols]


def closest_match(string_list, input_string):
    """Return the entry of ``string_list`` closest to ``input_string``.

    An exact member is returned as-is. Otherwise the candidate with the
    smallest Levenshtein distance wins (the first one on ties, as ``min``
    keeps the first minimum).

    Parameters:
    - string_list: candidate strings to choose from
    - input_string: value to match, typically a free-form LLM answer

    Returns:
    - The matching candidate, or ``input_string`` unchanged when
      ``string_list`` is empty (there is nothing to snap it to).
    """
    print(f"Finding closest match for '{input_string}' in list of {len(string_list)} items")
    if input_string in string_list:
        print(f"Exact match found: {input_string}")
        return input_string

    # min() raises ValueError on an empty sequence; callers wrap this in a
    # broad except and would otherwise discard their whole result.
    if not string_list:
        print(f"No candidates to match '{input_string}' against; returning it unchanged")
        return input_string

    # LLM JSON may carry null or non-string values; the distance function
    # needs a string, so coerce before scoring.
    if input_string is None:
        needle = ""
    elif isinstance(input_string, str):
        needle = input_string
    else:
        needle = str(input_string)

    closest_string = min(string_list, key=lambda s: levenshtein_distance(s, needle))
    print(f"Closest match: '{input_string}' -> '{closest_string}'")
    return closest_string


def _empty_analysis():
    """Return a fresh blank analysis payload (short transcript or failed analysis)."""
    return {
        "reason_type": "",
        "reason_verbatim": "",
        "reason": "",
        "end_of_call_status": "",
        "products_mentioned": [],
        "overall_sentiment": "",
        "emotions": [],
        "customer_type": "",
        "customer_gender": "",
        "summary": "",
        "transcript": ""
    }


def _coerce_choice(value, allowed, fallback=None):
    """Map a free-form LLM value onto the canonical spelling of one of ``allowed``.

    Matching ignores case and surrounding whitespace. When nothing matches,
    ``fallback`` is returned if given; otherwise the closest entry by edit
    distance is used. With no candidates at all the cleaned value is kept.
    """
    text = value.strip() if isinstance(value, str) else ""

    canonical_by_key = {}
    for option in allowed:
        # setdefault keeps the first spelling if the list has case duplicates.
        canonical_by_key.setdefault(str(option).casefold(), option)

    canonical = canonical_by_key.get(text.casefold())
    if canonical is not None:
        return canonical
    if fallback is not None:
        return fallback
    if not allowed:
        return text
    return closest_match(allowed, text)


def process_transcript(transcript, brand_name, complaint_reasons, enquiry_reasons,
                      request_reasons, handled_list, master_outlet_id):
    """Classify a call transcript with the LLM and validate the answer.

    Parameters:
    - transcript: text of the call
    - brand_name: brand the call was made to (inserted into the prompt)
    - complaint_reasons / enquiry_reasons / request_reasons: allowed values of
      ``reason`` for each ``reason_type``
    - handled_list: allowed values of ``end_of_call_status``
    - master_outlet_id: outlet identifier; only used for logging here

    Returns:
    - A dict with the keys reason_type, reason_verbatim, reason,
      end_of_call_status, products_mentioned, overall_sentiment, emotions,
      customer_type, customer_gender, summary and transcript. Categorical
      fields are snapped to the canonical spelling of an allowed value.
      A blank payload is returned when the transcript is shorter than 100
      characters or when the LLM call / parsing fails; this function does
      not raise for those failures.
    """
    print(f"Processing transcript for brand: {brand_name}, outlet: {master_outlet_id}")
    print(f"Transcript length: {len(transcript)}")

    if len(transcript) < 100:
        print("Transcript too short, returning empty output")
        return _empty_analysis()

    response_format = """{
        "reason_type": "Complaint",
        "reason_verbatim": "",
        "reason": "",
        "end_of_call_status": "",
        "products_mentioned": [
            {
                "product": "",
                "product_sentiment": "",
                "product_verbatim": "",
                "tags": ["", ""]
            }
        ],
        "overall_sentiment": "",
        "emotions": [
            {
                "emotion": "",
                "emotion_verbatim": ""
            }
        ],
        "customer_type": "",
        "customer_gender": "",
        "summary": "",
        "transcript": [
            "agent: how can i help you today?",
            "customer: i want to give my suit for alteration.",
            "agent:..."
        ]
    }"""

    prompt = f"""this is a call to a brand {brand_name}: {transcript}

        I want to understand why they called under 3 headings Request, Complaint or Enquiry
        Complaints are obvious, Request if they needed support with something and Enquiry for sales based calls
        Tag each call as one or the other. Reason cannot be null or empty.
        I also want a verbatim extract which explains the categorisation

        If the reason type is complaint, then pick the reason from this list only {complaint_reasons}
        If enquiry, then pick the reason from this list only {enquiry_reasons}
        and if request, then pick the reason from this list only {request_reasons}
        Make sure you pick the reasons only from the list provided.

        I also want to know how the complaint was handled, how the support request was handled and how the enquiry was addressed - these should be picked from this list {handled_list}.

        Pull out the products mentioned from the call. Based on each product, i want you to pull out the sentiment towards that product specifically, it can be either positive, negative or neutral and the verbatim extracts that back up the sentiment you pulled out.

        determine the overall sentiment of the call, it can be positive, negative or neutral.

        Emotions can be chosen from the list (neutral, sadness, anger, frustration, happiness, fear, confusion, satisfaction) respond with neutral if no clear indications of other emotions are present.

        Give me the customer type also. The customer type can have one of three values: 'New', 'Existing' or 'Unsure', decide this on the basis of the conversation taking place.

        With this, also pull out the customer gender. It can be either Male, Female or Unsure depending on how the customer is addressed in the call.

        Also generate a small summary of the call.

        Also get the diarization of the speakers in the form of a dialogue exchange between the customer and the agent.

        you can give me the results as a json like so {response_format}
        make sure the response is always in a valid json."""

    try:
        print("Calling GPT for transcript analysis")
        completion = client.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that replies with exactly what is asked and in the same exact format every time."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            response_format={"type": "json_object"},
            max_completion_tokens=30000,
            reasoning_effort="low",
        )

        output = json.loads(completion.choices[0].message.content)
        if not isinstance(output, dict):
            raise ValueError(f"expected a JSON object, got {type(output).__name__}")
        print("Successfully parsed GPT response")

        # Fill in anything the model left out so one missing key does not
        # throw away the rest of the analysis.
        for key, default in _empty_analysis().items():
            output.setdefault(key, default)

        # Downstream code iterates these; a null or scalar would break it.
        for list_key in ("products_mentioned", "emotions"):
            if not isinstance(output[list_key], list):
                output[list_key] = []

        # Basic validation: store the canonical spelling, not just check it.
        output['overall_sentiment'] = _coerce_choice(
            output['overall_sentiment'], ['Positive', 'Negative', 'Neutral'], fallback='Neutral')
        output['customer_gender'] = _coerce_choice(
            output['customer_gender'], ['Male', 'Female', 'Unsure'], fallback='Unsure')
        output['customer_type'] = _coerce_choice(
            output['customer_type'], ['New', 'Existing', 'Unsure'], fallback='Unsure')
        output['reason_type'] = _coerce_choice(
            output['reason_type'], ['Enquiry', 'Complaint', 'Request'])
        output['end_of_call_status'] = _coerce_choice(
            output['end_of_call_status'], handled_list)

        # Validate reasons based on type
        reasons_by_type = {
            'Complaint': complaint_reasons,
            'Enquiry': enquiry_reasons,
            'Request': request_reasons,
        }
        valid_reasons = reasons_by_type.get(output['reason_type'])
        if valid_reasons is not None:
            output['reason'] = _coerce_choice(output['reason'], valid_reasons)

        return output

    except Exception as e:
        # Deliberate best-effort boundary: callers always get a payload back.
        print(f"Error processing transcript: {str(e)}")
        return _empty_analysis()


def _sangeetha_tree_levels(output, transcript):
    """Return (l0, l1, l2, l3) for Sangeetha Gadgets: stock status per product."""
    products = output['products_mentioned']

    # Decide this before calling the LLM: with no product to classify the
    # completion result would never be used.
    if products == [] or products[0]['product'] == '':
        return 'Others', output['end_of_call_status'], "", ""

    response_format = {
        "products_mentioned": [
            {
                "product_name": '',
                "product_status": "one of two In Stock or Out of Stock"
            }
        ]
    }
    product_names = [product['product'] for product in products]

    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that replies with exactly what is asked and in the same exact format every time."
            },
            {
                "role": "user",
                "content": f"""Using this audio to text between a customer and a brand that sells mobile phones and other electronics, i want you to populate the given json response format on the basis of these rules:

                    - Here are the products mentioned in the call: {product_names}.
                    - If the provided list is empty, return an empty string in the main key i.e. "products_mentioned"
                    - Cover all products given in the list even if it does not make sense, reply with in stock by default
                    - product_name key must have the name of the product
                    - product status key must have one of two values i.e. 'In Stock' or 'Out of Stock' depending on the context received from the audio to text. Use in_stock as the default condition if out of stock is not mentioned explicitly.

                    {transcript}

                    {response_format}"""
            }
        ],
        response_format={"type": "json_object"},
        max_completion_tokens=30000,
        reasoning_effort="low",
    )

    result = json.loads(completion.choices[0].message.content)

    valid_statuses = ['Out of Stock', 'In Stock']
    for item in result['products_mentioned']:
        if item['product_status'] not in valid_statuses:
            item['product_status'] = closest_match(valid_statuses, item['product_status'])

    statuses = '--||--'.join(item['product_status'] for item in result['products_mentioned'])
    names = '--||--'.join(item['product_name'] for item in result['products_mentioned'])
    return statuses, names, output['end_of_call_status'], ""


def _hdfc_ergo_tree_levels(transcript):
    """Return (l0, l1, l2, l3) for HDFC ERGO: competitor mention and competitor name."""
    response_format = {
        "l0_reason": "",
        "l1_reason": ""
    }
    mention_types = ["Customer Compared Competitor", "Agent Pitched Competitor", "No Competitor Mentioned"]
    competitors = ["ICICI Lombard", "Acko", "Star Health", "Care Insurance", "Niva Bupa", "Tata AIG", "GoDigit", "Reliance General Insurance"]

    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that replies with exactly what is asked and in the same exact format every time."
            },
            {
                "role": "user",
                "content": f"""Using this audio to text between a customer and a brand Called HDFC ERGO that sells insurance of all kinds, i want you to populate the given json response format on the basis of these rules:

                        - The l0_reason field can have the following values: ["Customer Compared Competitor", "Agent Pitched Competitor", "No Competitor Mentioned"]
                        - The l1_reason field will be given a value based on the following: if the l0_reason field has a value that is one of the two: "Customer Compared Competitor", "Agent Pitched Competitor" then, the l1_reason field will have the name of the Competitor mentioned. The name can be picked from this list: ["ICICI Lombard", "Acko", "Star Health", "Care Insurance", "Niva Bupa", "Tata AIG", "GoDigit", "Reliance General Insurance"]
                        - if the l0_reason has the value "No Competitor Mentioned" then, the l1_reason will be an empty string.

                        {transcript}

                        {response_format}"""
            }
        ],
        response_format={"type": "json_object"},
        max_completion_tokens=30000,
        reasoning_effort="low",
    )

    result = json.loads(completion.choices[0].message.content)

    # `or ""` turns a JSON null into an empty string before fuzzy matching.
    l0_reason = result.get("l0_reason", "") or ""
    l1_reason = result.get("l1_reason", "") or ""

    if l0_reason not in mention_types:
        l0_reason = closest_match(mention_types, l0_reason)

    if l0_reason == "No Competitor Mentioned":
        # The prompt defines l1 as empty here; enforce it rather than trust
        # the model to comply.
        l1_reason = ""
    elif l1_reason not in competitors:
        l1_reason = closest_match(competitors, l1_reason)

    return l0_reason, l1_reason, "", ""


def add_tree_nodes(output, master_outlet_id, transcript):
    """Add the l0–l3 tree-node fields to ``output`` for the given outlet.

    Parameters:
    - output: analysis dict from process_transcript; mutated in place
    - master_outlet_id: outlet identifier selecting the brand-specific rules
      (321091 Sangeetha Gadgets, 271756 HDFC ERGO; anything else gets blanks)
    - transcript: text of the call, forwarded to the brand-specific prompt

    Returns:
    - The same ``output`` dict with l0_reason, l1_reason, l2_reason and
      l3_reason set. All four are empty strings when the outlet has no
      specific rules or the brand-specific processing raised.
    """
    print(f"Adding tree nodes for master_outlet_id: {master_outlet_id}")
    blank_levels = ("", "", "", "")

    if master_outlet_id == 321091:  # Sangeetha Gadgets
        print("Processing Sangeetha Gadgets tree nodes")
        try:
            levels = _sangeetha_tree_levels(output, transcript)
        except Exception as e:
            print(f"Error processing Sangeetha Gadgets completion: {str(e)}")
            levels = blank_levels

    elif master_outlet_id == 271756:  # HDFC ERGO
        print("Processing HDFC ERGO tree nodes")
        try:
            levels = _hdfc_ergo_tree_levels(transcript)
        except Exception as e:
            print(f"Error processing ERGO analysis response: {e}")
            levels = blank_levels

    else:
        print(f"No specific tree node processing for master_outlet_id: {master_outlet_id}")
        levels = blank_levels

    # Assign only once everything is computed so a failure never leaves the
    # four fields half-populated.
    output["l0_reason"], output["l1_reason"], output["l2_reason"], output["l3_reason"] = levels
    return output


# Notebook entry point: runs the full pipeline for one recording
def process_call(audio_path, brand_name, master_outlet_id,
                complaint_reasons, enquiry_reasons, request_reasons, handled_list):
    """
    Run transcription, LLM analysis and tree-node tagging for one call.

    Parameters:
    - audio_path: location of the audio file to process
    - brand_name: brand the call belongs to
    - master_outlet_id: identifier of the outlet
    - complaint_reasons: allowed reasons when the call is a complaint
    - enquiry_reasons: allowed reasons when the call is an enquiry
    - request_reasons: allowed reasons when the call is a request
    - handled_list: allowed end-of-call handling statuses

    Returns:
    - The analysis dictionary, including the l0–l3 tree-node fields

    Progress for each of the three steps is printed to stdout. An exception
    raised by transcribe_audio is not caught here.
    """

    # Step 1: audio -> text
    print("\n=== STEP 1: TRANSCRIPTION ===")
    call_text = transcribe_audio(audio_path, brand_name)
    preview = call_text[:200]
    print(f"Transcript: {preview}...")

    # Step 2: text -> structured analysis
    print("\n=== STEP 2: LLM ANALYSIS ===")
    analysis = process_transcript(
        transcript=call_text,
        brand_name=brand_name,
        complaint_reasons=complaint_reasons,
        enquiry_reasons=enquiry_reasons,
        request_reasons=request_reasons,
        handled_list=handled_list,
        master_outlet_id=master_outlet_id,
    )
    rendered = json.dumps(analysis, indent=2)
    print(f"Analysis complete: {rendered}")

    # Step 3: analysis -> analysis with outlet-specific tree nodes
    print("\n=== STEP 3: TREE NODES ===")
    enriched = add_tree_nodes(analysis, master_outlet_id, call_text)
    level0 = enriched.get('l0_reason')
    level1 = enriched.get('l1_reason')
    print(f"Tree nodes added: l0={level0}, l1={level1}")

    return enriched


# Example usage in notebook:
"""
# Define your parameters
brand_name = "Sangeetha Mobiles"
master_outlet_id = 321091
audio_path = "path/to/your/audio.mp3"

complaint_reasons = ["Product Issue", "Service Issue", "Billing Issue"]
enquiry_reasons = ["Product Availability", "Pricing", "Store Location"]
request_reasons = ["Order Status", "Return Request", "Technical Support"]
handled_list = ['Info Provided', 'Store Visit Confirmed', 'Issue Resolved', 'Call Dropped']

# Process the call
result = process_call(
    audio_path=audio_path,
    brand_name=brand_name,
    master_outlet_id=master_outlet_id,
    complaint_reasons=complaint_reasons,
    enquiry_reasons=enquiry_reasons,
    request_reasons=request_reasons,
    handled_list=handled_list
)

# View results
print(json.dumps(result, indent=2))
"""

'\n# Define your parameters\nbrand_name = "Sangeetha Mobiles"\nmaster_outlet_id = 321091\naudio_path = "path/to/your/audio.mp3"\n\ncomplaint_reasons = ["Product Issue", "Service Issue", "Billing Issue"]\nenquiry_reasons = ["Product Availability", "Pricing", "Store Location"]\nrequest_reasons = ["Order Status", "Return Request", "Technical Support"]\nhandled_list = [\'Info Provided\', \'Store Visit Confirmed\', \'Issue Resolved\', \'Call Dropped\']\n\n# Process the call\nresult = process_call(\n    audio_path=audio_path,\n    brand_name=brand_name,\n    master_outlet_id=master_outlet_id,\n    complaint_reasons=complaint_reasons,\n    enquiry_reasons=enquiry_reasons,\n    request_reasons=request_reasons,\n    handled_list=handled_list\n)\n\n# View results\nprint(json.dumps(result, indent=2))\n'