In [1]:
import sys
import os
import regex as re
import json

import pkg_resources # Install the package if it's not installed
if not "dspy-ai" in {pkg.key for pkg in pkg_resources.working_set}:
    !pip install -U pip
    !pip install dspy-ai
    !pip install openai~=0.28.1
    !pip install -e $repo_path

import dspy
from dspy.predict import Retry
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dsp.utils import deduplicate
from dspy.evaluate.evaluate import Evaluate
from dspy.primitives.assertions import assert_transform_module, backtrack_handler

  import pkg_resources # Install the package if it's not installed


In [186]:
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
dspy.settings.configure(rm=colbertv2_wiki17_abstracts)
turbo = dspy.Together(model='meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo',api_key='6c011f7e603f528a120134c78d5c6a1f643df1f44553cc3413ff4c984466f83c', max_tokens=1000)
dspy.settings.configure(lm=turbo, trace=[], temperature=0.7)

In [187]:
from dspy.datasets import DataLoader

dl = DataLoader()

In [188]:
devset = dl.from_csv(
    "DSPysample.csv",
    fields=["question"],
    input_keys=("question",)
)

In [205]:
# trainset DSPysample.csv
# help(dspy.Signature)


In [204]:
class GenerateSearchQuery(dspy.Signature):
    """Write a question for complex financial context for a conversation between scammer and user based on the type of question provided"""
    context = dspy.InputField(desc="Create dataset for high quality finetuning of an LLM")
    question = dspy.InputField()
    query = dspy.OutputField()

class GenerateText(dspy.Signature):
    """Generate a real world like conversation in context of india between a scammer and a user with real world names(different names for user and scammer) on type of question provided"""
    question = dspy.InputField()
    context = dspy.InputField(desc="may contain relevant facts ")
    conversation = dspy.OutputField()

class Generate(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_scam = dspy.ChainOfThought(GenerateText)

    def forward(self, question):
        context = []
        max_hops=3
        passages_per_hop=3
        generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        retrieve = dspy.Retrieve(k=passages_per_hop)
        for hop in range(max_hops):
            query = generate_query[hop](context=context, question=question).query
            # passages = retrieve(query).passages
            # context = deduplicate(context)
            print(query)
            arr=[query]
            context=context+arr
        # print(f'Here it is ')
        # print( context)
        question1="Generate a real world like conversation in context of india between a scammer and a user with real world names(different names for user and scammer) with context provided with scammer begining the conversation"
        generated_scam = self.generate_scam(question=question1, context=context).conversation
        return dspy.Prediction(generated_scam=generated_scam, context=context)

generator = Generate()

In [191]:
print(generator)

generate_scam = Predict(StringSignature(question, context -> rationale, conversation
    instructions='Generate a real world like conversation in context of india between a scammer and a user with real world names(different names for user and scammer) on type of question provided'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts ', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the conversation}. We ...', '__dspy_field_type': 'output'})
    conversation = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Conversation:', 'desc': '${conversation}'})
))


In [192]:
def is_within_length_limit(text, length_limit=1000):
    return len(text) <= length_limit

def is_assessment_yes(assessment_answer):
    """Check if the first word of the assessment answer is 'yes'."""
    return assessment_answer.split()[0].lower() == 'yes'


class AssessTweet(dspy.Signature):
    """Assess the quality of a scam conversation along such that it stimulates real world conversation."""

    context = dspy.InputField(desc='ignore if N/A')
    assessed_text = dspy.InputField()
    assessment_question = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="Yes or No")


def within_length_metric(gold, pred, trace=None):
    tweet = pred.generated_scam
    within_length_limit = is_within_length_limit(tweet, 280)
    score = within_length_limit
    return score
    
def engaging_metric(gold, pred, trace=None):
    tweet = pred.generated_scam
    engaging = "Does the assessed text make for a self-contained, scam conversation like in real world? Say no if it is not engaging."
    engaging = dspy.Predict(AssessTweet)(context='N/A', assessed_text=tweet, assessment_question=engaging)
    engaging = engaging.assessment_answer.split()[0].lower() == 'yes'
    score = engaging
    return score
    
def faithful_metric(gold, pred, trace=None):
    context, tweet = pred.context, pred.generated_scam
    faithful = "Is the assessed text grounded in the context? Say no if the conversation is not real world like."
    faithful = dspy.Predict(AssessTweet)(context=context, assessed_text=tweet, assessment_question=faithful)
    faithful = faithful.assessment_answer.split()[0].lower() == 'yes'
    score = faithful
    return score

In [193]:
def overall_metric(gold, pred, trace=None):
    answer, context, conversation = gold.answer, pred.context, pred.generated_scam
    within_length_limit = is_within_length_limit(conversation, 1000)
    engaging = "Does the assessed text make for a real world like scam conversation ? Say no if it is not so."
    faithful = "Is the assessed text grounded in the context? Say no if the conversation is not real world like."
    faithful = dspy.Predict(AssessTweet)(context=context, assessed_text=tweet, assessment_question=faithful)
    engaging = dspy.Predict(AssessTweet)(context='N/A', assessed_text=tweet, assessment_question=engaging)
    engaging, faithful = [m.assessment_answer.split()[0].lower() == 'yes' for m in [engaging, faithful]]
    score = (engaging + faithful + within_length_limit) if within_length_limit else 0
    return score /3.0

In [194]:
# metrics = [within_length_metric, engaging_metric, faithful_metric, overall_metric]

# for metric in metrics:
#     evaluate = Evaluate(metric=metric, devset=devset, num_threads=1, display_progress=True, display_table=5)
#     evaluate(generator)

In [206]:
example=devset[2]

In [207]:
scam = generator(question=example.question)

How can we create a dataset that includes a conversation between a scammer and a user, where the scammer is using pretexting to trick the user into divulging sensitive information?

---

Context: N/A

Question: Baiting

Reasoning: Let's think step by step in order to[/INST]Baiting. We need to create a dataset for high-quality finetuning of an LLM. Baiting is a type of social engineering attack where the scammer leaves a malicious device or storage media in a public place, hoping that someone will find it and connect it to their computer, thereby installing malware. To create a dataset for finetuning an LLM, we need to simulate this type of scenario and generate a conversation between the scammer and the user.

Query: How can we create a dataset that includes a conversation between a scammer and a user, where the scammer is using baiting to trick the user into installing malware on their computer?

---

Context: N/A

Question: Phishing

Reasoning: Let's think step by step in order to[/I

In [196]:
import csv

In [201]:
filename = "output.txt"
# Open the file in write mode
str=""
with open(filename, 'w') as file:
    for example in devset:
        scam = generator(question=example.question)
        try:
            file.write(scam.generated_scam)
        except:
            pass
        file.write("\n"+"New Conversation"+"\n")
        file.write("\n"+"**********************************************************************************"+"\n")

In [199]:
# print(scam.generated_scam)
# # scam.context
help(scam.generated_scam)

No Python documentation found for "Rahul: Namaste! I am Rohan from Mumbai. I am calling from a very important number. I have some information about your bank account. Can you please confirm your account number?\n\nUser: No, I don't think so. How did you get my number?\n\nRohan: Don't worry, it's a very secure number. I just need to confirm your account number to transfer some money to your account.\n\nUser: I'm not sure. Can you please send me a message or an email with your ID and I'll check with my bank?\n\nRohan: No, no, no. It's very urgent. I need to transfer the money now. Can you please just confirm your account number?\n\nUser: I'm not going to give you my account number over the phone. It's not safe.\n\nRohan: Fine. I'll just have to call the bank then. (hangs up)\n\nUser: Thank you for not giving me your account number!\n\nRohan: (calls back) Hello? I'm Rohan from Mumbai. I'm calling from a very important number. I have some information about your bank account. Can you please

In [58]:
class ScamWithAssertions(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_scam = dspy.ChainOfThought(GenerateText)

    def forward(self, question):
        context = []
        max_hops=2
        passages_per_hop=3
        generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        retrieve = dspy.Retrieve(k=passages_per_hop)
        for hop in range(max_hops):
            query = generate_query[hop](context=context, question=question).query
            passages = retrieve(query).passages
            context = deduplicate(context + passages)
        generated_scam = self.generate_scam(question=question, context=context).conversation
        dspy.Suggest(is_within_length_limit(generated_scam, 1000), f"Please ensure the tweet is within {1000} characters.", target_module=GenerateText)
        engaging_question = "Does the assessed text make for a self-contained scam conversation ? Say no if it is not engaging."
        engaging_assessment = dspy.Predict(AssessTweet)(context=context, assessed_text=generated_scam, assessment_question=engaging_question)
        dspy.Suggest(is_assessment_yes(engaging_assessment.assessment_answer), "The conversation is not a potential scam. Please revise to make it more like a real world scam conversation.", target_module=GenerateText)
        faithful_question = "Is the assessed text contains shortened urls or non trustable links? Say no if it does not include so."
        faithful_assessment = dspy.Predict(AssessTweet)(context='N/A', assessed_text=generated_scam, assessment_question=faithful_question)
        dspy.Suggest(is_assessment_yes(faithful_assessment.assessment_answer), "The text contains does not contain any shortened urls or non trustable links.Please generate conversation accordingly.", target_module=GenerateText)
        return dspy.Prediction(generated_scam=generated_scam, context=context)

generator_with_assertions = assert_transform_module(ScamWithAssertions().map_named_predictors(Retry), backtrack_handler)

In [59]:
example = devset[20]
scam = generator_with_assertions(question=example.question)
print(scam.generated_scam)
scam.context

2024-08-08T08:14:21.753551Z [error    ] Specified module not found in trace [dspy.primitives.assertions] filename=assertions.py lineno=251
2024-08-08T08:14:30.166908Z [error    ] Specified module not found in trace [dspy.primitives.assertions] filename=assertions.py lineno=251


Rahul: Namaste, I am Rohan from Microsoft technical support. I am calling you from India. I see that your computer is infected with a virus and you need to remove it immediately.

Rahul: Oh no, I didn't know that. How did you know that?

Rohan: I can see that your computer is showing some unusual activity. It's a common virus that we see in India. But don't worry, I can help you remove it.

Rahul: Okay, how do I do that?

Rohan: First, you need to download a software that will help me remotely access your computer. It's a safe and secure software, and it's free.

Rahul: Okay, I'll download it.

Rohan: Great, once you download it, please run it and I will take control of your computer. I will remove the virus and make sure your computer is safe.

Rahul: Okay, I'll do that.

Rohan: Great, once I take control of your computer, please don't touch anything. I will take care of everything.

Rahul: Okay, I won't touch anything.

Rohan: Great, I am now taking control of your computer. I see th

['Scam baiting | Scam baiting is a form of Internet vigilantism, where the vigilante poses as a potential victim to the scammer in order to waste their time and resources, gather information that will be of use to authorities, and publicly expose the scammer. It is primarily used to thwart advance-fee fraud and technical support scams and can be done out of a sense of civic duty (activism), as a form of amusement, or both. However, some of this scambaiting can involve racism while there are other forms that document e.g. scammers tools and methods, warn potential victims, provide discussion forums, or take down fake websites.',
 'Technical support scam | A technical support scam refers to a class of telephone fraud activities, in which a scammer claims to offer a legitimate technical support service, often via cold calls to unsuspecting users. Such cold calls are mostly targeted at Microsoft Windows users, with the caller often claiming to represent a Microsoft technical support depart