In [None]:
import os
import pandas as pd
from tqdm import tqdm
from together import Together
import json
import re

In [None]:
# Define the prompt creation function for generating new cases
def generate_case_prompt(example):
    """Build the instruction prompt asking the model for five new cases.

    Args:
        example: A dict describing one seed case. It is embedded in the prompt
            as pretty-printed JSON.

    Returns:
        The full prompt string to send as the user message.

    Notes:
        * Values that ``json`` cannot serialize natively (for example
          timestamps or dates) are rendered with ``str()`` instead of raising
          ``TypeError``.
        * The closing instruction asks for a JSON array. The previous wording
          asked for comma-separated values, which contradicted the
          "only JSON" requirement stated at the top of the prompt.
    """
    # default=str keeps prompt generation from crashing on non-JSON-native values.
    example_json = json.dumps(example, indent=4, default=str)
    return f"""
  
Behave as an instance generator. I will provide an example in JSON format. I need answer only in JSON format no unnessary comments needed in the output  
For each example, generate five new instances that are similar in structure but distinct in the following aspects:  
- Context: Use a different setting or scenario that aligns with real-world situations.  
- Agents: Identify distinct active and passive agents with clearly defined roles and relationships.  
- Ethical issues: Introduce new and realistic ethical dilemmas relevant to the scenario.  
- Features: Maintain the same set of features as the original example, providing detailed, well-structured, and contextually accurate values.  
But make sure that these features are not more than 2-3 words

1. Ensure each generated instance explores diverse domains (e.g., healthcare, technology, education, business, law, etc.).  
2. Clearly differentiate between the active agent (initiator of the action) and the passive agent (affected party).  
3. Ensure the ethical issue is thought-provoking and aligns with the action and consequence.  
4. Provide detailed descriptions for the consequence, its severity, utility, and duration.  
5. Avoid repetitive or overly similar cases. Each instance should introduce fresh perspectives.  

Here is an example in JSON format:
{example_json}
Return the output as a JSON array containing the five generated instances, with no text outside the JSON.

"""

In [None]:
# Function to extract all JSON objects from the response
def extract_json_from_response(response_text):
    """Extract every JSON object embedded in a model response.

    The text is scanned for ``{`` characters and a real JSON decoder is run
    from each one. Unlike a non-greedy ``{.*?}`` regex, this handles nested
    objects and closing braces that appear inside string values. Objects that
    sit inside a JSON array are returned individually.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A list with one parsed dict per JSON object found, in order of
        appearance, or ``None`` when nothing could be parsed or the input is
        not a string.

    Notes:
        If the text starting at a ``{`` is not valid JSON, scanning resumes at
        the next ``{``. A well-formed object nested inside a malformed outer
        object is therefore still returned.
    """
    if not isinstance(response_text, str):
        print(f"Error extracting JSON: expected str, got {type(response_text).__name__}")
        return None

    decoder = json.JSONDecoder()
    parsed_data = []
    position = response_text.find("{")
    while position != -1:
        try:
            obj, end = decoder.raw_decode(response_text, position)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            position = response_text.find("{", position + 1)
            continue
        parsed_data.append(obj)
        # Continue after the decoded object so nested objects are not re-read.
        position = response_text.find("{", end)
    return parsed_data if parsed_data else None



In [None]:
# Function to run the agent and get a response
def run_agent(client, prompt, model, content):
    """Send one chat-completion request and parse JSON objects from the reply.

    Args:
        client: Chat client exposing ``chat.completions.create(...)``.
        prompt: Text sent as the user message.
        model: Model identifier passed to the API.
        content: Instruction text describing how the model should behave.
            It is sent with the ``system`` role so the model treats it as an
            instruction rather than as its own previous turn.

    Returns:
        Whatever ``extract_json_from_response`` returns for the reply text
        (a list of parsed objects, or ``None``). ``None`` is also returned when
        the API call fails or the reply is empty.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": content},
                {"role": "user", "content": prompt},
            ],
        )
        message_text = response.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and let the caller record the example as invalid.
        print(f"Error in API call: {e}")
        return None

    # The API may return no text at all; report that explicitly instead of
    # failing on ``None.strip()`` and mislabeling it as an API error.
    response_text = message_text.strip() if message_text else ""
    if not response_text:
        print("Empty response from model.")
        return None

    print(f"Raw Response: {response_text}")
    # Extract and validate JSON
    return extract_json_from_response(response_text)



In [None]:
# Read the API key from the environment rather than hardcoding it in the
# notebook. Set TOGETHER_API_KEY before running (shell export, `%env`, or the
# platform's secrets manager); an existing value is never overwritten here.
api_key = os.environ.get("TOGETHER_API_KEY")
if not api_key:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; define it in the environment before running this cell."
    )
client = Together(api_key=api_key)

# Define the LLM model
llm_model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

# Output locations. The invalid-responses file keeps its Kaggle location when
# that directory exists and falls back to the working directory elsewhere.
output_csv = "augmentation.csv"
kaggle_working_dir = "/kaggle/working"
invalid_output_file = (
    os.path.join(kaggle_working_dir, "invalid_responses.json")
    if os.path.isdir(kaggle_working_dir)
    else "invalid_responses.json"
)

# Load dataset
data = pd.read_json("practice.json")

# Initialize lists to collect generated cases and invalid responses
all_generated_cases = []
invalid_responses = []

# Process each example in the dataset. Converting to records once keeps each
# column's own type (row-wise `iloc` access would upcast mixed numeric rows).
examples = data.to_dict(orient="records")
for i, example in enumerate(tqdm(examples)):
    # Generate new cases based on the example
    response_data = run_agent(
        client,
        generate_case_prompt(example),
        llm_model,
        "You are a JSON instance generator and legal domain expert."
    )

    # Validate and append the response
    if response_data:
        all_generated_cases.extend(response_data)  # Add all parsed JSON objects
    else:
        invalid_responses.append({"example_index": i, "example": example})

# Convert all generated cases into a DataFrame and save
if all_generated_cases:
    try:
        results_df = pd.DataFrame(all_generated_cases)
        results_df.to_csv(output_csv, index=False)
        # Report the file that was actually written.
        print(f"Generated cases saved as '{output_csv}'")
    except Exception as e:
        print(f"Error saving generated cases to CSV: {e}")
else:
    print("No valid cases generated.")

# Save invalid responses for debugging
if invalid_responses:
    try:
        with open(invalid_output_file, "w") as f:
            # default=str: examples may hold values json cannot encode natively
            # (e.g. timestamps); stringify them instead of aborting mid-write.
            json.dump(invalid_responses, f, indent=4, default=str)
        print(f"Invalid responses saved as '{invalid_output_file}'")
    except Exception as e:
        print(f"Error saving invalid responses: {e}")
else:
    print("No invalid responses.")