# Config

In [1]:
import os
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import google.api_core.exceptions
import time
import json
import random

In [3]:
genai.configure(api_key="")

In [4]:
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "application/json",
}

In [None]:
# Define the model
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    generation_config=generation_config,
)

In [6]:
countries = [
    "Singapore",
    "United States",
    "United Kingdom",
    "Australia",
    "Canada",
    "New Zealand",
]

# For Scam

In [7]:
scam_list = [
    "Bank Impersonation Scam – Scammers pretend to be from your bank, claiming there's suspicious activity on your account.",
    "Credit Card Fraud Scam – Callers claim your credit card was used fraudulently and ask for verification details.",
    "Loan Approval Scam – Fake lenders offer 'pre-approved' loans but require an upfront payment or personal information.",
    "Debt Collection Scam – Fake debt collectors demand immediate payment for debts you don’t owe.",
    "Student Loan Forgiveness Scam – Callers promise to erase student loans in exchange for fees or personal data.",
    "IRS/Tax Scam – Scammers impersonate tax agencies and demand immediate payment under threat of arrest.",
    "Social Security Number (SSN) Scam – Callers claim your SSN is compromised and need verification.",
    "Medicare/Health Insurance Scam – Scammers pretend to be from Medicare, asking for policy details or payments.",
    "Jury Duty Scam – Fake legal representatives claim you missed jury duty and must pay a fine.",
    "Customs & Border Protection Scam – Callers claim you have illegal packages in your name and must pay to avoid charges.",
    "Microsoft/Apple Support Scam – Scammers claim your computer is infected and require remote access.",
    "Amazon Order Scam – Callers say there’s a suspicious order on your Amazon account and request verification.",
    "Google Business Listing Scam – Scammers say your business listing is about to be removed unless you pay.",
    "Antivirus Subscription Scam – Callers pretend to be from antivirus companies, offering fake renewals.",
    "Internet Provider Scam – Scammers claim your internet is being shut down unless you verify payment.",
    "Fake Sweepstakes/Lottery Scam – Scammers say you won a prize but must pay a 'processing fee' first.",
    "Publisher’s Clearing House Scam – Impersonators claim you're a winner and need to pay taxes upfront.",
    "Vacation Prize Scam – Scammers offer free trips but require personal details or payment.",
    "Fake Charity Scam – Callers ask for donations to fake charities, often during crises.",
    "Secret Shopper Scam – Callers recruit victims as 'secret shoppers' but require fees to participate.",
    "Work-From-Home Scam – Scammers offer fake remote jobs requiring upfront payments.",
    "Fake Job Interview Scam – Callers claim to offer high-paying jobs and ask for personal data.",
    "Pyramid Scheme Scam – Callers promote 'investment opportunities' that require recruitment of others.",
    "Business Grant Scam – Scammers promise government grants for businesses but demand fees.",
    "Invoice Fraud Scam – Fake suppliers claim unpaid invoices and demand payment.",
    "Online Dating Scam – Romance scammers build relationships online and ask for money.",
    "Military Romance Scam – Scammers impersonate soldiers asking for financial help.",
    "Inheritance Scam – Callers claim you inherited money but must pay a fee to receive it.",
    "Fake Relative in Distress Scam – Scammers pretend to be a family member needing urgent help.",
    "Wedding Vendor Scam – Fake vendors take deposits for weddings and disappear.",
    "Electricity/Water Shutoff Scam – Scammers say your utilities will be shut off unless you pay.",
    "Solar Panel Scam – Fake companies offer government rebates on solar panels and ask for personal details.",
    "Home Repair Scam – Callers claim your home needs urgent repairs and demand deposits.",
    "Duct Cleaning Scam – Callers offer cheap duct cleaning services but demand upfront payment.",
    "Property Tax Scam – Fake assessors claim your property tax needs immediate payment.",
    "Health Insurance Refund Scam – Callers claim you're due a refund but need account details.",
    "Pain Relief Device Scam – Fake medical suppliers offer free pain relief devices in exchange for Medicare info.",
    "COVID-19 Vaccine/Testing Scam – Scammers offer fake COVID tests or vaccines for money.",
    "Prescription Drug Scam – Fake pharmacies offer discounted medicine and steal payment info.",
    "Disability Benefits Scam – Callers claim you qualify for disability benefits but need verification fees.",
    "Fake Stock Investment Scam – Scammers promise high investment returns and steal money.",
    "Cryptocurrency Scam – Callers convince victims to invest in fake crypto schemes.",
    "Ponzi Scheme Scam – Fraudsters recruit investors for nonexistent businesses.",
    "Fake Gold/Precious Metals Investment Scam – Scammers claim to sell gold at 'discounted' rates.",
    "Forex Trading Scam – Callers promote fake forex trading platforms and steal investments.",
    "Police/Legal Threat Scam – Scammers pretend to be police and demand fines.",
    "Fake Survey Scam – Callers ask personal questions to steal information.",
    "Pet Adoption Scam – Scammers offer non-existent pets for adoption fees.",
    "Rental Property Scam – Fake landlords demand security deposits for properties they don’t own.",
    "Grandparent Scam – Scammers pretend to be grandchildren needing emergency money."
]

In [8]:
# File where JSON logs will be stored
output_file = "scam_call_logs.json"

# Read existing data if the file exists
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        try:
            all_logs = json.load(f)
            if not isinstance(all_logs, list):  # Ensure it's a list
                all_logs = []
        except json.JSONDecodeError:
            all_logs = []
else:
    all_logs = []  # Initialize empty list if file does not exist

# Maximum number of retry attempts
MAX_RETRIES = 5  

# For each scam category, generate logs
for item in scam_list:
    print(f"Generating logs for: {item}")

    for i in range(1):  # ⚠️⚠️ NEED TO CHANGE: SET TO 1 JUST FOR TESTING PURPOSE
        attempt = 0  # Initialize retry counter

        while attempt < MAX_RETRIES:
            try:
                # Generate response
                response = model.generate_content(""" 
                    Create a full and complete theoretical scam call log in JSON format for the purpose of learning.
                    You are free to determine the usual length of each scam call log, the longer the better.
                    The log should contain ONLY the spoken dialogue between the caller and receiver.
                    Do not include any descriptions of actions, such as hanging up, typing, or any other non-verbal cues.
                    All dialogue should be a verbatim transcription of a hypothetical conversation.
                    All examples should be valid scams, is_scam should be true.
                    A victim is considered "scammed"=true if they have given out any PII.
                    Follow the JSON format below.
                    The context of this scam takes place in""" + random.choice(countries) + """. But only use English words and language.

                    The call may or may not contain PII (Personal Info) from the list below.
                    If it does, use placeholders wrapped in square brackets. Do not include any PII not in the examples.
                    [BANK PIN] [BANK ACCOUNT NUMBER] [PHONE NUMBER]
                    [CARD NUMBER] [CVV] [CARD EXPIRY DATE]
                    [DATE OF BIRTH] [SSN] [LOGIN ID] [PASSWORD]

                    If you include any names, just use a fictitious name of someone living in that country.

                    For context, scammers usually elicit this information by:
                    Impersonation: Posing as legitimate organizations (government agencies, banks, tech companies, etc.) to gain your trust.
                    Threats: Claiming you owe money, face legal consequences, or have a security issue.
                    Promises: Offering prizes, discounts, or services in exchange for your information.
                    Empathy/Urgency: Creating a sense of crisis to pressure you into acting quickly without thinking.
                    Building Rapport: Trying to be friendly and conversational to lower your guard.
                    Technical Jargon: Using confusing language to make you feel like you need their help.
                    Spoofing: Making their caller ID appear legitimate.

                    The category of the scam call log should be:
                    """ + random.choice(scam_list) + """

                    Caller is the Scammer
                    Receiver is the Victim

                    Please follow this JSON format strictly.
                    Do not add additional information or modify the structure.

                    [
                    {
                        "scam_category": "e.g. bank",
                        "is_scam": true/false,
                        "scammed": true/false,
                        "language": "",
                        "dialogue": [
                        {
                            "person": "caller",
                            "message": "Hello I am ..."
                        },
                        {
                            "person": "receiver",
                            "message": "Yes this is..."
                        },
                        {
                            "person": "caller",
                            "message": "We need your bank pin urgently..."
                        },
                        {
                            "person": "receiver",
                            "message": "Sure my bank pin is [BANK PIN]"
                        }
                        ... the rest of the conversation (The victim may or may not have given the information (scammed or not scammed))
                        ]
                    }
                    ]
                    """
                    ,
                    safety_settings={
                        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                    }                    
                )

                # Workaround for model returning malformed JSON
                response_text = response.text.replace("}]}]}", "}]}]")

                # Parse response as JSON
                new_logs = json.loads(response_text)

                # Ensure the new logs are in list format
                if isinstance(new_logs, list):
                    all_logs.extend(new_logs)  # Append new data to existing list
                else:
                    print(f"Iteration {i+1}: Unexpected JSON format, skipping.")

                # Save updated logs back to file
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(all_logs, f, ensure_ascii=False, indent=4)

                print(f"Iteration {i+1}: Log successfully added.")
                break  # Exit retry loop on success

            except google.api_core.exceptions.InternalServerError as e:
                print(f"Iteration {i+1}: Internal Server Error - {e}")
                attempt += 1
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)  # Wait before retrying

            except google.api_core.exceptions.ResourceExhausted as e:
                print(f"Iteration {i+1}: API Rate Limit Exceeded - {e}")
                print("Waiting 60 seconds before retrying...")
                time.sleep(60)  # Wait before retrying
                attempt += 1

            except json.JSONDecodeError as e:
                print(response.text)
                print(f"Iteration {i+1}: Failed to parse JSON - {e}")
                break  # Don't retry on JSON parsing issues

            except Exception as e:
                print(f"Iteration {i+1}: Unexpected error - {e}")
                break  # Don't retry on unknown errors

        # Wait before next iteration
        time.sleep(10)

print("Process completed.")

Generating logs for: Bank Impersonation Scam – Scammers pretend to be from your bank, claiming there's suspicious activity on your account.
Iteration 1: Log successfully added.
Generating logs for: Credit Card Fraud Scam – Callers claim your credit card was used fraudulently and ask for verification details.


NameError: name 'google' is not defined

# For Non-Scam

In [85]:
legitimate_list = [
    "Family Check-In – A relative calls to see how you're doing.",
    "Friend Catch-Up – A friend calls just to chat.",
    "Birthday Call – Someone calls to wish you a happy birthday.",
    "Holiday Greeting Call – A call from friends or family during holidays.",
    "Event Invitation – A friend invites you to a party, wedding, or gathering.",
    "Apology Call – Someone calls to make amends after a disagreement.",
    "Surprise Call – An unexpected but welcome call from an old friend.",
    "Condolence Call – Someone calls to express sympathy after a loss.",
    "New Baby Announcement – A family member calls to share exciting baby news.",
    "Engagement/Wedding Announcement – A friend or relative shares engagement or wedding news.",
    "Job Interview Call – A recruiter or hiring manager schedules an interview.",
    "Client Inquiry Call – A potential client calls for more information about your services.",
    "Business Meeting Call – A colleague calls to coordinate a meeting.",
    "Sales Call (Legitimate) – A representative calls about a service or product you’re genuinely interested in.",
    "Project Update Call – A team member provides updates on a joint project.",
    "Boss/Supervisor Check-In – A manager calls to discuss work tasks.",
    "Coworker Collaboration Call – A coworker calls to discuss a work-related matter.",
    "Vendor or Supplier Call – A supplier calls about an order or service.",
    "Work-from-Home Coordination Call – A call to align remote work expectations.",
    "HR or Payroll Call – A legitimate HR rep calls about benefits, salary, or work policies.",
    "Bank Fraud Alert – Your bank calls to verify suspicious activity on your account.",
    "Doctor’s Office Call – A clinic or hospital calls about an appointment or test results.",
    "Dentist Appointment Reminder – A dentist's office confirms your scheduled appointment.",
    "Pharmacy Prescription Ready Call – A pharmacy notifies you that your medication is ready.",
    "Insurance Verification Call – Your health or auto insurance provider confirms details.",
    "Home Repair Technician Call – A plumber, electrician, or contractor calls about scheduled repairs.",
    "Cable/Internet Service Call – Your provider calls to confirm a service upgrade or maintenance.",
    "Mechanic or Car Service Call – An auto repair shop updates you on your vehicle’s status.",
    "Package Delivery Call – A courier service calls to confirm or update you on a delivery.",
    "Tech Support Follow-Up – A legitimate company calls back after a support request.",
    "School or University Call – A teacher or administrator calls regarding your child’s education.",
    "Library Book Due Reminder – A public library calls to remind you of overdue books.",
    "Community Event Call – A local organization invites you to an event.",
    "PTA Meeting Call – A parent-teacher association rep calls about school events.",
    "Alumni Association Call – Your former school or university reaches out for networking or fundraising.",
    "Tutoring Session Confirmation – A tutor or educational service calls to confirm an appointment.",
    "Volunteer Coordination Call – A non-profit or charity calls about volunteer work.",
    "Church or Religious Group Call – A faith-based group contacts you for updates or support.",
    "Sports Team Call – A coach or team member calls to confirm game details.",
    "Club or Hobby Group Call – A book club, running group, or other interest-based club calls to coordinate events.",
    "Police Department Notification – Law enforcement calls for legitimate reasons (e.g., found property, crime update).",
    "Fire Department Call – A fire department provides updates or safety alerts.",
    "Ambulance or Medical Emergency Call – A hospital or paramedic provides critical medical updates.",
    "City or County Alert Call – Local government contacts you about weather emergencies, road closures, or safety issues.",
    "Jury Duty Notification Call – A court calls to confirm your jury duty service.",
    "Social Services Call – A legitimate agency calls about benefits or assistance programs.",
    "Voter Registration Call – Election officials provide voting details or updates.",
    "Military Service Call – A recruiter or officer calls regarding military service or benefits.",
    "Public Utility Call – Your water, gas, or electric company notifies you of outages or maintenance.",
    "Legal Call from an Attorney – A lawyer calls about an ongoing case or legal matter."
]

In [None]:
# File where JSON logs will be stored
output_file = "non_scam_call_logs.json"

# Read existing data if the file exists
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        try:
            all_logs = json.load(f)
            if not isinstance(all_logs, list):  # Ensure it's a list
                all_logs = []
        except json.JSONDecodeError:
            all_logs = []
else:
    all_logs = []  # Initialize empty list if file does not exist

# Maximum number of retry attempts
MAX_RETRIES = 5  

# For each legitimate call category, generate logs
for item in legitimate_list:
    print("Generating logs for", item)

    for i in range(10):  # ⚠️⚠️ NEED TO CHANGE: SET TO 1 JUST FOR TESTING PURPOSE
        attempt = 0  # Initialize retry counter

        while attempt < MAX_RETRIES:
            try:
                # Generate response
                response = model.generate_content(
                    """ 
                    Create full and complete theoretical call logs in JSON format.
                    The call is a valid conversation between two people in a real-world scenario.
                                        
                    You are free to determine the appropriate length of each call log, the longer the better.
                    The context of this call takes place in""" + random.choice(countries) + """.
                    No need to include any non-verbal cues.
                    Only use English words and language found in the dictionary.

                    The log should contain ONLY the spoken dialogue between the caller and receiver.
                    Do not include any descriptions of actions, such as hanging up, typing, or any other non-verbal cues.
                    All dialogue should be a verbatim transcription of a hypothetical conversation.

                    The category of the call log should be: """ + random.choice(legitimate_list) + """
                    
                    Ensure that the category is properly represented in the JSON.

                    The call may or may not contain PII (Personal Info) from the list below.
                    If it does, use placeholders wrapped in square brackets, e.g. [PHONE NUMBER]
                    If you include any names, just use a fictitious name of someone living in that country, do not use placeholders for names.
                                        
                    Caller is the one who intiaited the call
                    Receiver is the person answering the call

                    Please follow this JSON format strictly.
                    Do not add additional information or modify the structure.
                    Make sure to check the brackets and quotes in the JSON format.                      

                    [
                    {
                        "call_category": "",
                        "language": "",
                        "dialogue": [
                        {
                            "person": "caller",
                            "message": "Hello, this is..."
                        },
                        {
                            "person": "receiver",
                            "message": "Oh, hi! How can I help you?"
                        },
                        {
                            "person": "caller",
                            "message": "I'm calling about..."
                        },
                        {
                            "person": "receiver",
                            "message": "Sure, let me check on that..."
                        }
                        ... the rest of the conversation
                        ]
                    }
                    """
                )

                # Workaround for model returning malformed JSON
                response_text = response.text.replace("}]}]}", "}]}]")

                # Parse response as JSON
                new_logs = json.loads(response_text)

                # Ensure the new logs are in list format
                if isinstance(new_logs, list):
                    all_logs.extend(new_logs)  # Append new data to existing list
                else:
                    print(response.text)
                    print(f"Iteration {i+1}: Unexpected JSON format, skipping.")

                # Save updated logs back to file
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(all_logs, f, ensure_ascii=False, indent=4)

                print(f"Iteration {i+1}: Log successfully added.")
                break  # Exit retry loop on success

            except google.api_core.exceptions.InternalServerError as e:
                print(f"Iteration {i+1}: Internal Server Error - {e}")
                attempt += 1
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)  # Wait before retrying

            except google.api_core.exceptions.ResourceExhausted as e:
                print(f"Iteration {i+1}: API Rate Limit Exceeded - {e}")
                print("Waiting 60 seconds before retrying...")
                time.sleep(60)  # Wait before retrying
                attempt += 1

            except json.JSONDecodeError as e:
                print(response.text)
                print(f"Iteration {i+1}: Failed to parse JSON - {e}")
                break  # Don't retry on JSON parsing issues

            except Exception as e:
                print(f"Iteration {i+1}: Unexpected error - {e}")
                break  # Don't retry on unknown errors

        # Wait before next iteration
        time.sleep(10)

print("Process completed.")

Generating logs for Family Check-In – A relative calls to see how you're doing.
Iteration 1: Log successfully added.
Generating logs for Friend Catch-Up – A friend calls just to chat.
Iteration 1: Log successfully added.
Generating logs for Birthday Call – Someone calls to wish you a happy birthday.
Iteration 1: Log successfully added.
Generating logs for Holiday Greeting Call – A call from friends or family during holidays.
Iteration 1: Log successfully added.
Generating logs for Event Invitation – A friend invites you to a party, wedding, or gathering.
{"call_category": "Friend Catch-Up", "language": "English", "dialogue": [{"person": "caller", "message": "G'day, mate! It's Liam."}, {"person": "receiver", "message": "Liam! Hey, how's it going?  Long time no talk!"}, {"person": "caller", "message": "Yeah, it has been a while!  Life's been pretty hectic.  Just wanted to see how you were doing."}, {"person": "receiver", "message": "Not too bad, thanks.  Busy with work, you know how it i

KeyboardInterrupt: 

# Check Data

In [None]:
import json
import re

In [23]:
# Load the JSON for the scam call logs
with open("scam_call_logs.json", "r", encoding="utf-8") as f:
    scam_logs = json.load(f)

# Load the JSON for the non-scam call logs
with open("non_scam_call_logs.json", "r", encoding="utf-8") as f:
    non_scam_logs = json.load(f)

In [24]:
# Track all PII found in scam call logs, and the number of occurrences
scam_pii = {}
for log in scam_logs:
    for dialogue in log["dialogue"]:
        if "message" in dialogue:
            message = dialogue["message"]
            pii_matches = re.findall(r"\[[^\]]+\]", message)  # Matches anything inside square brackets
            for pii in pii_matches:
                if pii in scam_pii:
                    scam_pii[pii] += 1
                else:
                    scam_pii[pii] = 1

# Print the PII and their occurrences
for pii, count in scam_pii.items():
    print(f"{pii}: {count}")

[CARD NUMBER]: 448
[CARD EXPIRY DATE]: 408
[CVV]: 468
[PHONE NUMBER]: 94
[DATE OF BIRTH]: 291
[BANK ACCOUNT NUMBER]: 317
[BANK PIN]: 196
[SSN]: 70
[PASSWORD]: 17
[ADDRESS]: 21
[SORT CODE]: 6
[fake website address]: 2
[MEDICARE NUMBER]: 1
[SWIFT CODE]: 1
[Victim Name]: 3
[LOGIN ID]: 16
[Fake Registration Number]: 1
[AMOUNT]: 7
[ADDRESS, omitted for safety]: 1
[pause]: 1
[LAST 4 DIGITS OF CARD]: 1
[EMAIL ADDRESS]: 1
[STORE NAME]: 2
[AMOUNT_2]: 1
[FAKE COMPANY REGISTRATION NUMBER]: 1
[RECIPIENT EMAIL]: 1
[SSN - last 4 digits]: 1
[NAME]: 1
[PLACE]: 2
[ACCOUNT NUMBER]: 2
[last four digits of card]: 1
[POSTCODE]: 2
[ROUTING NUMBER]: 1
[BSB]: 1
[COUNTRY]: 1
[TOWN NAME]: 1


In [25]:
# Track all PII found in non-scam call logs, and the number of occurrences
non_scam_pii = {}
for log in non_scam_logs:
    for dialogue in log["dialogue"]:
        if "message" in dialogue:
            message = dialogue["message"]
            pii_matches = re.findall(r"\[[^\]]+\]", message)  # Matches anything inside square brackets
            for pii in pii_matches:
                if pii in non_scam_pii:
                    non_scam_pii[pii] += 1
                else:
                    non_scam_pii[pii] = 1

# Print the PII and their occurrences
for pii, count in non_scam_pii.items():
    print(f"{pii}: {count}")

[STREET ADDRESS]: 18
[ADDRESS OF GARDEN]: 1
[PHONE NUMBER]: 236
[WI-FI PASSWORD]: 1
[SPOUSE'S NAME]: 14
[CHILD'S NAME]: 16
[TOY NAME]: 2
[STUDENT NAME]: 13
[PARENT NAME]: 4
[DAUGHTER'S NAME]: 3
[SON'S NAME]: 2
[DATE]: 67
[MARKETING DEPARTMENT]: 1
[BANK NAME]: 4
[VAT]: 1
[COMPETITOR NAME]: 1
[GLOBAL EVENT]: 2
[MANAGER NAME]: 1
[EMAIL ADDRESS]: 86
[PLACE]: 4
[SOCIAL MEDIA PLATFORM]: 4
[CONTACT NAME]: 6
[BUSINESS SOCIAL MEDIA PLATFORM]: 2
[DATA SEGMENT]: 1
[MONTHLY PREMIUM AMOUNT]: 1
[COVERAGE AMOUNT]: 2
[DEDUCTIBLE AMOUNT]: 4
[START DATE]: 3
[END DATE]: 1
[PAYMENT DATE]: 1
[PRICE RANGE]: 2
[WEBSITE ADDRESS]: 14
[CITY NAME]: 43
[ADDRESS]: 182
[PUNGGOL RIVER]: 1
[PUNGGOL ROAD]: 2
[MY WATERWAY @ PUNGGOL]: 1
[MRT]: 1
[UNIVERSITY NAME]: 2
[RESTAURANT NAME]: 20
[PERSON'S NAME]: 2
[TIME]: 12
[CHURCH NAME]: 4
[CITY]: 40
[STATE]: 11
[ZIP CODE]: 6
[DEBIT CARD NUMBER]: 1
[LAST 4 DIGITS]: 1
[AMOUNT]: 30
[DATE OF BIRTH]: 38
[NUMBER]: 7
[ELECTORAL DIVISION NAME]: 1
[VENUE TYPE]: 1
[BODY OF WATER]: 1
[

# Convert to CSV for further processing

In [34]:
import json
import pandas as pd

# Load JSON data
file_path = "scam_call_logs.json"  # Update this with the correct file path
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Process data into a structured format
rows = []
for entry in data:
    scam_category = entry["scam_category"]
    is_scam = 1  # Since all entries are scams, this is set to 1
    dialogue = " ".join([f'{msg["person"]}: {msg["message"]}' for msg in entry["dialogue"]])
    
    rows.append({"dialogue": dialogue, "labels": is_scam, "type": scam_category})

# Create DataFrame
df = pd.DataFrame(rows)

# Save to CSV
csv_file_path = "scam_call_logs.csv"
df.to_csv(csv_file_path, index=False, encoding="utf-8")

print(f"CSV file saved as {csv_file_path}")

PermissionError: [Errno 13] Permission denied: 'scam_call_logs.csv'

In [None]:
# Load JSON data
file_path = "non_scam_call_logs.json"  # Update this with the correct file path
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Process data into a structured format
rows = []
for entry in data:
    call_category = entry["call_category"]
    dialogue = " ".join([f'{msg["person"]}: {msg["message"]}' for msg in entry["dialogue"]])
    
    rows.append({"dialogue": dialogue, "labels": 0, "type": call_category})

# Create DataFrame
df = pd.DataFrame(rows)

# Save to CSV
csv_file_path = "non_scam_call_logs.csv"
df.to_csv(csv_file_path, index=False, encoding="utf-8")

print(f"CSV file saved as {csv_file_path}")

CSV file saved as non_scam_call_logs.csv


# Resend to LLM to Generate the Fake Details in Square Bracket

In [None]:
import os
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import google.api_core.exceptions
import time
import json
import random
import re
import pandas as pd

genai.configure(api_key="AIzaSyDKdUu8b9Xbm5dAEjxpBN4X1SruZ5uwaaY")

generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

# Define the model
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    generation_config=generation_config,
)

# Load the scam_call_logs.csv file
scam_logs = pd.read_csv("scam_call_logs.csv")

# Load the non_scam_call_logs.csv file
non_scam_logs = pd.read_csv("non_scam_call_logs.csv")

In [48]:
scam_logs.head()

Unnamed: 0,dialogue,labels,type
0,"caller: Hello, good morning. My name is Priya ...",1,Fake Charity Scam
1,"caller: Good morning, Madam Lee. My name is Ag...",1,Customs & Border Protection Scam
2,"caller: Good morning, Mr. Lim. This is Inspect...",1,Customs & Border Protection Scam
3,"caller: Hello, is this Sarah Miller? receiver:...",1,Student Loan Forgiveness Scam
4,"caller: Hello, is this David Chen? receiver: Y...",1,Student Loan Forgiveness Scam


In [52]:
# Maximum number of retry attempts
MAX_RETRIES = 5  

# for each non-scam dialog, generate fake data using LLM and replace existing data
for index, row in non_scam_logs.iterrows():
    print(f"Writing fake data for row {index}")

    attempt = 0  # Initialize retry counter

    while attempt < MAX_RETRIES:
        try:
            # Generate response
            response = model.generate_content(f""" 
                This is a non-scam call dialog, replace all occurrences of [] with some fake but realistic data where relevant.
                Do not respond with anything other than the dialogue.
                Return it in just a single line string, no line breaks.
                                              
                {row['dialogue']}
                """,
                safety_settings={
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                }                    
            )

            # Get the generated content
            new_data = response.text

            # Update the dialogue with the generated content
            non_scam_logs.at[index, "dialogue"] = new_data

            # Save the updated data back to the CSV file
            non_scam_logs.to_csv("non_scam_call_logs.csv", index=False)

            print(f"Fake data successfully written for {index}")

            # Wait before next iteration
            time.sleep(10)

            break  # Exit retry loop on success

        except google.api_core.exceptions.InternalServerError as e:
            print(f"Internal Server Error - {e}")
            attempt += 1
            wait_time = 2 ** attempt
            print(f"Retrying in {wait_time} seconds...")

        except google.api_core.exceptions.ResourceExhausted as e:
            print(f"API Rate Limit Exceeded - {e}")
            print("Waiting 60 seconds before retrying...")
            time.sleep(60)
            attempt += 1

        except Exception as e:
            print(f"Unexpected error - {e}")
            break

print("Process completed.")

Writing fake data for row 0
Fake data successfully written for 0
Writing fake data for row 1
Fake data successfully written for 1
Writing fake data for row 2
Fake data successfully written for 2


KeyboardInterrupt: 

# Recheck updated CSV

In [6]:
import pandas as pd
import re

In [3]:
# Load the scam_call_logs.csv file and non_scam_call_logs.csv file
scam_logs = pd.read_csv("scam_call_logs.csv")

non_scam_logs = pd.read_csv("non_scam_call_logs.csv")

# Check if there are still instances of placeholders in the scam logs
scam_placeholders = scam_logs["dialogue"].str.findall(r"\[[^\]]+\]")
scam_placeholders = [item for sublist in scam_placeholders for item in sublist]

# Check if there are still instances of placeholders in the non-scam logs
non_scam_placeholders = non_scam_logs["dialogue"].str.findall(r"\[[^\]]+\]")
non_scam_placeholders = [item for sublist in non_scam_placeholders for item in sublist]

# Keep only unique placeholders
scam_placeholders = set(scam_placeholders)
non_scam_placeholders = set(non_scam_placeholders)

# Print found placeholders in a readable format
print("Scam Placeholders:")
for placeholder in scam_placeholders:
    print(placeholder)

print("\nNon-Scam Placeholders:")
for placeholder in non_scam_placeholders:
    print(placeholder)


Scam Placeholders:
[3,000]
[7890123]
[07/25]
[Apex Digital Services]
[4829]
[Rajesh]
[Michael Brown]
[4141141414141414]
[Samsung Galaxy S24s]
[Vancouver]
[BNKGNGXXX]
[Mark Thompson]
[Adelaide]
[Michael Thompson]
[349.50]
[$1200]
[Sarah Lee]
[08/24]
[bank account number]
[12th of May, 1978]
[2468101214]
[Henderson and Associates]
[4285]
[pause]
[789456123]
[456 Oak Avenue, Sometown]
[Michael Chen]
[2,500]
[Lee Jian Yu]
[05/25]
[Priti Patel]
[34-89-23]
[James]
[07/18/1985]
[Eleanor Smith]
[05/12/1988]
[Elite Living Properties]
[4321]
[Victim Name]
[Martinez]
[9876543210]
[249]
[July 12, 1988]
[Robert Johnson]
[01/01/1950]
[Caller Name]
[22/08/1968]
[Daniel]
[1234-5678-9012-3456]
[4111111111111111]
[5167 8900 2345 6789]
[Donald Trump]
[14/07/1975]
[222-33-4444]
[28/04/1975]
[890]
[5432]
[Manchester]
[Mom]
[20-45-91 12345678]
[326]
[Malaysia]
[Patterson]
[1234567890123456]
[Michelle Rodriguez]
[1968/04/12]
[Eleanor Rosewood]
[Evans]
[three]
[Australian Student Loan Assistance Centre]
[Pert

In [4]:
# Remove the square brackets from both scam and non-scam logs
scam_logs["dialogue"] = scam_logs["dialogue"].str.replace(r"\[|\]", "", regex=True)
non_scam_logs["dialogue"] = non_scam_logs["dialogue"].str.replace(r"\[|\]", "", regex=True)

# Save the updated scam and non-scam logs to CSV files
scam_logs.to_csv("scam_call_logs.csv", index=False)
non_scam_logs.to_csv("non_scam_call_logs.csv", index=False)

print("Placeholders removed and logs saved to CSV files.")

Placeholders removed and logs saved to CSV files.


In [30]:
# After manual inspection and modification

In [7]:
# Check if there are any instances where dialog does not start with "caller" or "receiver"
for index, row in scam_logs.iterrows():
    dialogue = row["dialogue"]
    if not re.match(r"^(caller):", dialogue):
        print(f"Scam Row {index} does not start with 'caller' or 'receiver'.")
        print(dialogue)

for index, row in non_scam_logs.iterrows():
    dialogue = row["dialogue"]
    if not re.match(r"^(caller|receiver):", dialogue):
        print(f"Non Scam Row {index} does not start with 'caller' or 'receiver'.")
        print(dialogue)

Non Scam Row 54 does not start with 'caller' or 'receiver'.
Kia ora, is this 021888555? receiver: Yes, speaking. caller: Kia ora, my name is Hana from the Auckland Central Library, calling to remind you about some overdue books. receiver: Oh, really? I thought I returned everything. caller: According to our system, you still have 'The Hobbit' and 'Māori Myths and Legends' checked out. They were due back on the 15th of July. receiver: Oh dear, 'The Hobbit'! I lent that to my son, Wiremu, he must have forgotten to return it. And the other one... hmm, I might have left it at my bach in Raglan. I'll have to check. caller: Okay, no worries. Just letting you know the overdue fees are accumulating at NZD$0.50 per book, per day. receiver: Right, right, I understand. What's the total amount due right now? caller: As of today, the total is NZD$15.00 for both books. receiver: Okay. I'll make sure Wiremu returns 'The Hobbit' tomorrow. As for 'Māori Myths and Legends', I'll be back in Raglan next w

In [8]:
# Append the "caller: " to the beginning of each dialogue in scam and non-scam logs if it is missing
scam_logs["dialogue"] = scam_logs["dialogue"].apply(lambda x: "caller: " + x if not x.startswith("caller:") else x)
non_scam_logs["dialogue"] = non_scam_logs["dialogue"].apply(lambda x: "caller: " + x if not x.startswith("caller:") else x)

In [9]:
# Check again
for index, row in scam_logs.iterrows():
    dialogue = row["dialogue"]
    if not re.match(r"^(caller|receiver):", dialogue):
        print(f"Scam Row {index} does not start with 'caller' or 'receiver'.")
        print(dialogue)

for index, row in non_scam_logs.iterrows():
    dialogue = row["dialogue"]
    if not re.match(r"^(caller|receiver):", dialogue):
        print(f"Non Scam Row {index} does not start with 'caller' or 'receiver'.")
        print(dialogue)

In [10]:
# Save the updated scam and non-scam logs to CSV files
scam_logs.to_csv("scam_call_logs.csv", index=False)
non_scam_logs.to_csv("non_scam_call_logs.csv", index=False)