<a href="https://colab.research.google.com/github/ichikomunikation/temporaryrepo/blob/main/distillabel_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "distilabel[openai]" --upgrade
!pip install mistralai


In [6]:
import json
import os
from distilabel.llms import OpenAILLM
from distilabel.llms import MistralLLM
from distilabel.llms import TogetherLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration
from distilabel.steps.tasks.self_instruct import SelfInstruct
from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
from enum import Enum
from pathlib import Path
from distilabel.llms import LlamaCppLLM
from pydantic import BaseModel, StringConstraints, conint
from typing_extensions import Annotated

In [7]:
import re

# Seed scam messages. Entries are passed verbatim to the LLM tasks in the
# cells below (as the "instruction" / "input" field of a row).
sample_data = [
    (
        "Our records show you overpaid for (a product or service). "
        "Kindly supply your bank routing and account number to receive your refund."
    ),
    (
        "You won a lottery of $4000. "
        "Please click on provide link http://random.to to receive your fund."
    ),
    (
        "URGENT! Your Bank Account Will Be Closed! "
        "Click here to verify your informations http://bit.ly/3xYz2d"
    ),
]

In [18]:
# Example charity-scam dialogue flattened into a single string; speaker turns
# are joined with ", " and prefixed with "scammer" / "user". The literal is
# split at turn boundaries purely for readability.
data_scam = (
    "scammer : Hello, thank you for reaching out to our esteemed organization, Charity for Humanity. I am Laura, your dedicated charity representative. We are working diligently to provide assistance to vulnerable people in your area. "
    ", user: Hello Laura, how can I help?"
    ", scammer: Great to hear from you! We need your support to provide essential aid such as food and medicine to the needy in your local community. As a valued donor, you can make a significant difference by contributing $250."
    ", user: Is this a legitimate charity?"
    ", scammer: Absolutely! We have been operating since 2005, helping thousands of people in need. Our work is endorsed by renowned philanthropists and celebrities. Your contribution will help provide much-needed relief to the less fortunate. "
    ", scammer: Once you make your tax-deductible donation, we will promptly prepare a personalized certificate acknowledging your generosity to be displayed in your home or office. "
    ", user: Okay, how do I make the payment?"
)

# Load the raw conversation dump. The encoding is pinned to UTF-8 so the text
# decodes the same way on every platform instead of following the locale
# default, which can mis-decode non-ASCII characters.
with open('conversations.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Every span enclosed in straight double quotes becomes one entry.
# `.` does not match a newline without re.DOTALL, so a quoted span has to sit
# on a single line to be picked up.
strings = re.findall(r'"(.*?)"', content)

# `conversation_data` is the name the augmentation cell below iterates over.
conversation_data = strings
print(len(conversation_data))

27


In [None]:
# Empty placeholder list; no visible cell of this notebook reads it.
# NOTE(review): the name looks like a misspelling of "application_description";
# it is kept unchanged in case code outside this view refers to it.
application_decription = list()

In [14]:
# Model, credential and sampling settings used by the LLM cells below.
MODEL_ID = "cognitivecomputations/dolphin-2.5-mixtral-8x7b"

# The API key is read from the environment rather than hard-coded: a literal
# key saved in the notebook is exposed to everyone who can read the file.
# Set TOGETHER_API_KEY before running these cells. Any key that was previously
# committed here should be treated as leaked and revoked.
# The value is None when the variable is not set.
LLM_API_KEY = os.environ.get("TOGETHER_API_KEY")

TEMPERATURE = 1.0

In [11]:
# Two-step pipeline: a loader that emits one row per entry of `sample_data`
# (one row per batch), connected to a text-generation task backed by TogetherLLM.
with Pipeline(
    name="scam_data_generation_pipeline",
    description="A pipline for generating scam dataset",
) as pipeline:
    # Every row carries the same system prompt; only the instruction varies.
    seed_rows = []
    for message in sample_data:
        seed_rows.append(
            {
                "system_prompt": "generate 5 texts which are similar to this in english language in JSON format.",
                "instruction": message,
            }
        )

    load_dataset = LoadDataFromDicts(name="load_data", data=seed_rows, batch_size=1)

    together_llm = TogetherLLM(model=MODEL_ID, api_key=LLM_API_KEY)
    text_generation = TextGeneration(name="scam_dataset_generation", llm=together_llm)

    # Route the loader's output into the generation task.
    load_dataset.connect(text_generation)

In [None]:
# Execute the pipeline, supplying the generation task's LLM sampling settings
# as runtime parameters keyed by the task's name.
sampling_settings = {"temperature": TEMPERATURE, "max_new_tokens": 512}
runtime_parameters = {
    text_generation.name: {"llm": {"generation_kwargs": sampling_settings}},
}
scam_dataset = pipeline.run(parameters=runtime_parameters)

In [None]:
# Show the object returned by pipeline.run() as this cell's output.
scam_dataset

In [None]:
# Show the first element stored under the 'default' -> 'train' keys of the result.
scam_dataset['default']['train'][0]

In [None]:
# Stand-alone SelfInstruct task, attached to a pipeline of its own so that it
# can be driven by hand from the following cells.
self_instruct_llm = TogetherLLM(model=MODEL_ID, api_key=LLM_API_KEY)
self_instruct = SelfInstruct(
    name="text-generation",
    llm=self_instruct_llm,
    application_description="scam message generator",
    num_instructions=8,
    input_batch_size=8,
    pipeline=Pipeline(name="self-instruct-pipeline"),
)

# A task used outside of a `with Pipeline(...)` block must be loaded explicitly.
self_instruct.load()

In [None]:
# Send the second sample message (the lottery one) through SelfInstruct as a
# one-row batch and keep the first batch the task yields.
self_instruct_batch = [{"input": sample_data[1]}]
result = next(self_instruct.process(self_instruct_batch))

In [None]:

# Pretty-print the whole batch held in `result` as indented JSON.
print(json.dumps(result, indent=2))

In [19]:
# Augment every loaded conversation with EvolInstruct and append the results
# to a text file.
#
# The task and its LLM client are built and loaded once, before the loops.
# Their configuration never changes between iterations, so re-creating them
# for each of the len(conversation_data) * 10 requests was pure overhead.
# NOTE(review): if EvolInstruct seeds its random state when constructed,
# reusing one instance also changes which mutation prompts are drawn from one
# iteration to the next; confirm this is the desired behaviour.
evol_instruct = EvolInstruct(
    name="evol-instruct",
    num_evolutions=2,
    store_evolutions=True,
    input_batch_size=8,
    llm=TogetherLLM(
        model=MODEL_ID,
        api_key=LLM_API_KEY,
        generation_kwargs={"max_new_tokens": 1024},
    ),
    pipeline=Pipeline(name="evol-instruct-pipeline"),
)

# remember to call .load() if testing outside of a Pipeline context
evol_instruct.load()

# Number of independent augmentation requests issued per source conversation.
EVOLUTIONS_PER_CONVERSATION = 10

# Append mode: re-running the cell adds to the output of earlier runs.
with open('complete_scam_augmented_data.txt', 'a', encoding='utf-8') as output:
    for conversation in conversation_data:
        for _ in range(EVOLUTIONS_PER_CONVERSATION):
            result = next(evol_instruct.process([{"instruction": conversation}]))

            # Only the first entry of 'evolved_instructions' for the single
            # input row is written, followed by one blank separator line.
            output.write(json.dumps(result[0]['evolved_instructions'][0], indent=2) + "\n" + "\n")


In [None]:
# Evolve two hand-written conversations in one call to the already-loaded
# EvolInstruct task and keep the first batch it yields.
manual_instructions = (
    "scammer: Hey there! I recently came across your profile and would like to connect with you about a financial opportunity, user: What kind of opportunity are you talking about?",
    "scammer: Yes, I was the winner of a big jackpot in the recent lottery! I feel so lucky Yes, I won big time in the recent lottery! I must be one of the luckiest people on Earth!, victim: Really? Congratulations, that's amazing news! How much did you win?, scammer: Thank you. I won over $5 million dollars! However, I need to deposit $10,000 to unlock my winnings., victim: Wow, that's a lot of money for unlocking! But, I understand. How can I assist you to complete this process?, scammer: Please send the $10,000 to this secure bank account. Once the transaction is completed, I will receive my winnings., victim: Okay, here's my bank account number for the transfer, scammer: That is perfect, thank you. Upon receiving the transaction we will process the withdrawal of your winnings and proceed with the transfer to your account.",
)
manual_batch = [{"instruction": text} for text in manual_instructions]
result = next(evol_instruct.process(manual_batch))

In [None]:
# Pretty-print the 'evolved_instructions' of the second row (index 1) of the batch.
print(json.dumps(result[1]['evolved_instructions'], indent=2))

# Script for Data Augmentation

In [None]:
"Bank Representative: Dear customer, we noticed that your KYC details are outdated. To ensure the security of your account, we recommend updating your KYC details within the next few days. You can do this by visiting our official website or the nearest branch., User: Oh! I didn’t realize my KYC details were outdated. How can I update them?, Bank Representative: No worries, Sir. You can easily update your KYC details by logging into your online banking account through our official website. Please make sure you are on our secure website by typing the URL directly into your browser, or you can visit your nearest branch for assistance., User: “The link you provided isn’t working., Bank Representative: I apologize for any inconvenience, Sir. Please avoid using any links from unknown sources. Instead, visit our official website or contact our customer service for help. We’ll be happy to guide you through the process., User: Alright, I’ll do that. Thank you for the assistance., Bank Representative: You’re welcome! If you have any further questions or need additional support, please don’t hesitate to contact us. Your security is our top priority."

## Experiment

In [None]:
class ScamType(str, Enum):
    """Closed set of scam categories.

    Each member's value is a human-readable label such as ``"phishing scam"``.
    Because ``str`` is mixed in, every member is also a ``str`` instance.
    """
    phishing = "phishing scam"
    lottery = "lottery scam"
    charity = "charity scam"
    investment = "investment scam"
    email = "email scam"
    romance = "romance scam"

# class Character(BaseModel):
#     name: Annotated[str, StringConstraints(max_length=30)]
#     scamType: ScamType

# Pipeline that asks the model for one scam conversation per scam category.
with Pipeline("Scam-generation") as pipeline:
    # The prompt previously read "conservation generator"; the intended word is
    # "conversation", and the typo was sent to the model verbatim.
    system_prompt = (
        "You are a scam conversation generator. You have seen thousands of conversations between scammers and normal people."
        " Please return a JSON object with a conversation between scammer and user. All conversations should have same format"
    )

    # One instruction row per ScamType member, in declaration order. Iterating
    # the enum replaces a hand-copied list of labels that had drifted from it
    # ("investmant scam"). `.value` is spelled out so the f-string does not
    # depend on how the running Python version formats mixed-in enum members.
    load_dataset = LoadDataFromDicts(
        name="load_instructions",
        data=[
            {
                "system_prompt": system_prompt,
                "instruction": f"Give me a scam conversation for {scam.value}",
            }
            for scam in ScamType
        ],
    )
    # llm = LlamaCppLLM(
    #     model=MODEL_ID,  # type: ignore
    #     n_gpu_layers=-1,
    #     n_ctx=1024,
    #     structured_output={"format": "json", "schema": Character},
    # )
    # Change to vLLM as such:
    # llm = vLLM(
    #     model="teknium/OpenHermes-2.5-Mistral-7B",
    #     extra_kwargs={"tensor_parallel_size": 1},
    #     structured_output={"format": "json", "schema": Character},
    # )

    llm = TogetherLLM(
        model=MODEL_ID,
        api_key=LLM_API_KEY,
    )

    # The task's "model_name" output is renamed to "generation_model".
    text_generation = TextGeneration(
        name="scam_generation",
        llm=llm,
        input_batch_size=8,
        output_mappings={"model_name": "generation_model"},
    )
    load_dataset >> text_generation


if __name__ == "__main__":
    # Run with caching disabled, passing a 256-token cap on new tokens to the
    # generation task's LLM.
    run_parameters = {
        text_generation.name: {"llm": {"generation_kwargs": {"max_new_tokens": 256}}},
    }
    distiset = pipeline.run(parameters=run_parameters, use_cache=False)

    # Print each generated text preceded by its running index.
    generations = distiset["default"]["train"]["generation"]
    for index, generated_text in enumerate(generations):
        print(f"Dataset: {index}")
        print(generated_text)