# Embedding data

In [None]:
import google.generativeai as genai
import pandas as pd
import gdown
import json
import time
import math

In [None]:
# Google Drive location of the dataset and the local file it is saved as.
url = "https://drive.google.com/uc?id=1TWaXhd9-3PqjusF3lgyA_UKI2qwOE7mU"
output = "gdf_public_impact.csv"

# Download the CSV (progress output enabled) and load it into a DataFrame.
gdown.download(url=url, output=output, quiet=False)
cleaned_df = pd.read_csv(filepath_or_buffer=output)

# Last expression of the cell: preview the first five rows.
cleaned_df.head(n=5)

In [None]:
# 1. Configure API
import os

# Prefer the GOOGLE_API_KEY environment variable so the secret does not have
# to be pasted into (and saved with) the notebook. The placeholder fallback
# keeps the original "edit this line" workflow working.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "YOUR_API_KEY"
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.5-flash')

# 2. Prepare sample data (random 10,000 rows)
TARGET_ROWS = 10000  # maximum number of rows to label
BATCH_SIZE = 100     # rows sent to the model per request

print(f"Sampling {TARGET_ROWS} rows...")

# Draw a reproducible random sample (fixed seed) when the dataset is larger
# than the target; otherwise use every row, in its original order.
sample_df = (
    cleaned_df.sample(n=TARGET_ROWS, random_state=42)
    if len(cleaned_df) > TARGET_ROWS
    else cleaned_df.copy()
).reset_index(drop=True)

print(f"Sampled {len(sample_df)} rows. Ready to send in batches of {BATCH_SIZE}.")


In [None]:
# 3. Batch prompt generation and API call
def analyze_batch(batch_data):
    """Ask the Gemini model to score one batch of complaint comments.

    Args:
        batch_data: JSON-serializable list of ``{"id": ..., "text": ...}``
            dicts. It is embedded verbatim (as JSON) in the prompt.

    Returns:
        A list of dicts parsed from the model's JSON reply. The prompt asks
        for the keys ``id``, ``found_keywords``, ``urgency`` and ``score``,
        but the reply's contents are not validated beyond "each item is a
        dict". Returns ``[]`` when the request fails or the reply cannot be
        parsed as a JSON array/object.
    """
    # Convert batch input to JSON string
    input_json = json.dumps(batch_data, ensure_ascii=False)

    # Prompt template.
    # NOTE(review): the prompt mentions an "attached CSV reference", but only
    # this text is passed to generate_content below -- confirm whether the
    # keyword list is supposed to be supplied some other way.
    # The output section asks for a JSON array whose objects echo the input
    # "id", because the caller matches results back to rows by that id.
    prompt = f"""
    You are an AI assistant evaluating complaint reports (Traffy Fondue).
    The output will be used as reference labels for training a machine-learning model.

    Task:
    Analyze the following comments (input is a JSON array):
    {input_json}

    Use the attached CSV reference (Risk Keywords) as guidance:
    1. Identify whether the comment contains any keywords from the list.
    2. Use the risk_score associated with matched keywords as the base score, but adjust based on context.
    3. Assign an urgency score from 0 to 10.
    4. Urgency levels: 0–3 = Low, 4–7 = Medium, 8–10 = High.
    5. Issues that are commonly reported or long-standing should be capped at a maximum of 7 (Medium).
    6. Scores 8–10 should be reserved only for severe, high-risk issues strongly aligned with risk keywords.
    7. If the report is largely speculative (e.g., contains phrases like "might", "maybe"), prioritize objective causes.

    Example guideline:
    - High: Life/property danger, fire, collapse, severe hazards, large-scale impact.
    - Medium: Moderate flooding, heavy traffic, damaged road, streetlight outage.
    - Low: General issues, cleanliness, inquiries, non-critical reports.

    Output JSON only: a JSON array with exactly one object per input comment,
    each echoing the "id" of the comment it describes:
    [
        {{
            "id": <id copied from the input>,
            "found_keywords": ["keyword1", "keyword2"],
            "urgency": "High" | "Medium" | "Low",
            "score": 0–10
        }}
    ]
    """

    # Bound up front so the error path can tell "no response at all" apart
    # from "response received but unusable".
    response = None
    try:
        response = model.generate_content(prompt)
        # Strip Markdown code fences the model may wrap around the JSON.
        clean_text = (
            response.text
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )
        parsed = json.loads(clean_text)

    except Exception as e:
        # Deliberately broad: one failed batch (network error, blocked
        # response, malformed JSON) must not abort the whole run.
        print(f"Error: {e}")
        try:
            print(f"Server response preview: {response.text[:100]}")
        except Exception:
            print("No response text available.")
        return []

    # A single object instead of an array is treated as a one-item batch.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        print(f"Error: expected a JSON array, got {type(parsed).__name__}")
        return []

    # Callers use dict access on every item, so drop anything else.
    return [item for item in parsed if isinstance(item, dict)]


In [None]:
# 4. Batch API loop
# Seconds to wait between requests. The quota is 10 requests per minute, i.e.
# one request every 6 seconds; 7 seconds leaves a safety margin.
REQUEST_INTERVAL_SECONDS = 7

results = []
total_batches = math.ceil(len(sample_df) / BATCH_SIZE)

print(f"Starting batch processing: {total_batches} batches (rate limit: 10 RPM)")

for batch_number, i in enumerate(range(0, len(sample_df), BATCH_SIZE), start=1):

    # Slice a batch of rows
    batch = sample_df.iloc[i : i + BATCH_SIZE]

    # Prepare input (use only ID and comment text). tolist() yields plain
    # Python values, which json.dumps can serialize.
    ticket_ids = batch["ticket_id"].tolist()
    comments = batch["comment"].tolist()
    batch_input = [
        {"id": ticket_id, "text": comment}
        for ticket_id, comment in zip(ticket_ids, comments)
    ]

    # Lookup table for matching results back to rows. setdefault keeps the
    # first comment seen for a duplicated ticket id.
    comment_by_id = {}
    for ticket_id, comment in zip(ticket_ids, comments):
        comment_by_id.setdefault(ticket_id, comment)

    # Call API
    batch_result = analyze_batch(batch_input)

    # A single JSON object is treated as a one-item batch.
    if isinstance(batch_result, dict):
        batch_result = [batch_result]

    # Match results back to original rows
    for res in batch_result or []:
        if not isinstance(res, dict):
            continue  # malformed item; nothing to match on
        result_id = res.get("id")
        try:
            comment = comment_by_id[result_id]
        except (KeyError, TypeError):
            # Id not in this batch, or not usable as a key: skip the result.
            continue
        results.append({
            "ticket_id": result_id,
            "comment": comment,
            "ai_urgency": res.get("urgency"),
            "ai_score": res.get("score"),
        })

    print(f"Batch {batch_number}/{total_batches} completed.")

    # Delay to respect the rate limit; no need to wait after the final batch.
    if i + BATCH_SIZE < len(sample_df):
        time.sleep(REQUEST_INTERVAL_SECONDS)

# 5. Save result
# Explicit columns keep the schema stable even when no result was collected.
final_df = pd.DataFrame(
    results,
    columns=["ticket_id", "comment", "ai_urgency", "ai_score"],
)
final_df.to_csv("traffy_gemini_batch.csv", index=False, encoding="utf-8-sig")

print(f"Completed. Total processed rows: {len(final_df)}")


In [None]:
# Cell output: the dtype pandas holds for each column of the results table.
final_df.dtypes

In [None]:
# Lift pandas' display limits: setting these options to None shows cell text
# without truncation and prints every column of a DataFrame.
for _display_option in ("display.max_colwidth", "display.max_columns"):
    pd.set_option(_display_option, None)

# Cell output: the first 20 rows of the results.
final_df.head(n=20)


# File Merge

In [None]:
import glob
import os

# Directory the merged dataset is written to.
folder = "data"

# Collect all CSV files in the current directory matching "batch_*.csv".
# glob returns matches in arbitrary order, so sort them (lexicographically)
# to make the row order of the merged file reproducible.
files = sorted(glob.glob("batch_*.csv"))
if not files:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        f'No files matching "batch_*.csv" found in {os.getcwd()}'
    )

# Load and concatenate all files
df_list = [pd.read_csv(f) for f in files]
full_df = pd.concat(df_list, ignore_index=True)  # Reindex after concatenation

print(f"Merge completed. Total rows: {len(full_df)}")

# Save final merged dataset, creating the output directory on first use.
os.makedirs(folder, exist_ok=True)
full_df.to_csv(os.path.join(folder, "gemini_score_data.csv"), index=False)
