In [None]:
import re

import pandas as pd

## Prompts for Generating Synthetic Messages

In [None]:
# Stricter variant of GENERATE_EXAMPLES_PROMPT: on top of the shared instructions it
# asks for an exact 25 English / 25 Hinglish split and for messages that differ from
# each other. The {category}, {subcategory} and {sample_messages} placeholders are
# filled in through a PromptTemplate further down.
GENERATE_EXAMPLES_PROMPT_2 = """
You are a helpful assistant. Your job is to generate messages that resemble a given message or list of messages.
The messages that will be provided to you are from a dataset that contains cyber crime related information. These
messages have been sent by people in India to the Indian cyber cell, reporting a cyber crime related incident that
happened with them. Each of these messages have been categorised into 15 categories. These messages are then further
classified into subcategories.
Your task is to generate more messages that resemble the given messages, and which belong to the same category and
subcategory as the given messages. Follow the following instuctions:
1. Go over the provided messages.
2. Observe the category and subcategory provided.
3. Generate 50 messages that are similar to the messages provided. These messages should be in simple English, similar to
the provided messages, or they can be in Hinglish, which is Hindi mixed with English. These messages should resemble how
Indian people write, mixing some Hindi words in English.
Out of the 50 generated messages, 25 should be in English and 25 should be in Hinglish.
4. Check the generated messages and observe whether they resemble the provided messages.
5. Make sure all the generated messages belong to the provided category and subcategory.
6. Make sure all the generated messages are different from each other.
Important: Messages should not be very similar to each other. They should have some differences.
7. Make sure 25 messages are in English and 25 messages are in Hinglish.

Generate output in the following format:
*** Messages
### Message 1
### Message 2
.
.
.
### Message 50

Following input is provided for this task
** Category: {category}
** Sub-category: {subcategory}
** Sample messages: {sample_messages}
"""

In [None]:
# Base generation prompt: asks the model for 50 look-alike complaints (English or
# Hinglish) for one category/sub-category, returned under "### Message N" markers.
# The {category}, {subcategory} and {sample_messages} placeholders are filled in
# through a PromptTemplate further down.
GENERATE_EXAMPLES_PROMPT = """
You are a helpful assistant. Your job is to generate messages that resemble a given message or list of messages.
The messages that will be provided to you are from a dataset that contains cyber crime related information. These
messages have been sent by people in India to the Indian cyber cell, reporting a cyber crime related incident that
happened with them. Each of these messages have been categorised into 15 categories. These messages are then further
classified into subcategories.
Your task is to generate more messages that resemble the given messages, and which belong to the same category and
subcategory as the given messages. Follow the following instuctions:
1. Go over the provided messages.
2. Observe the category and subcategory provided.
3. Generate 50 messages that are similar to the messages provided. These messages should be in simple English, similar to
the provided messages, or they can be in Hinglish, which is Hindi mixed with English. These messages should resemble how
Indian people write, mixing some Hindi words in English.
4. Check the generated messages and observe whether they resemble the provided messages.
5. Make sure all the generated messages belong to the provided category and subcategory.

Generate output in the following format:
*** Messages
### Message 1
### Message 2
.
.
.
### Message 50

Following input is provided for this task
** Category: {category}
** Sub-category: {subcategory}
** Sample messages: {sample_messages}
"""

In [None]:
# Install the LangChain OpenAI integration that provides ChatOpenAI (imported below).
# NOTE(review): -U pulls the latest release on every run; consider pinning a version
# so the notebook stays reproducible.
%pip install -qU langchain-openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.7/407.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.2/325.2 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Mount Google Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Drive root so the relative CSV paths used below resolve there.
%cd /content/drive/My Drive/

/content/drive/My Drive


## Load the Data

In [None]:
# Read the training and test splits from the current working directory.
df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Preview the first rows of the training data.
df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [None]:
# Distinct category / sub-category labels found in the training data.
# The resulting order is arbitrary because the values pass through a set.
category_set = list(set(df["category"]))
subcategory_set = list(set(df["sub_category"]))

In [None]:
from collections import defaultdict
# category -> set of sub-categories observed with it; populated by the loop in the next cell.
category_to_subcategory_map = defaultdict(set)

In [None]:
# Record which sub-categories occur under each category.
# Iterating the two label columns directly avoids DataFrame.iterrows(), which
# constructs a new Series object for every single row.
for category, subcategory in zip(df["category"], df["sub_category"]):
  # Missing sub-categories (NaN) are added unchanged, so a category that has
  # no sub-category shows up in the map with a nan member.
  category_to_subcategory_map[category].add(subcategory)

In [None]:
# Row counts per category and per sub-category, used to spot classes with
# very few examples.
category_count = defaultdict(int)
subcategory_count = defaultdict(int)

# Iterate the two label columns directly instead of DataFrame.iterrows(),
# which constructs a new Series object for every single row.
for category, subcategory in zip(df["category"], df["sub_category"]):
  category_count[category] += 1
  # Rows with a missing sub-category are counted too (under a nan key).
  subcategory_count[subcategory] += 1

In [None]:
# Inspect the number of training rows per category.
category_count

defaultdict(int,
            {'Online and Social Media Related Crime': 12140,
             'Online Financial Fraud': 57434,
             'Online Gambling  Betting': 444,
             'RapeGang Rape RGRSexually Abusive Content': 2822,
             'Any Other Cyber Crime': 10878,
             'Cyber Attack/ Dependent Crimes': 3608,
             'Cryptocurrency Crime': 480,
             'Sexually Explicit Act': 1552,
             'Sexually Obscene material': 1838,
             'Hacking  Damage to computercomputer system etc': 1710,
             'Cyber Terrorism': 161,
             'Child Pornography CPChild Sexual Abuse Material CSAM': 379,
             'Online Cyber Trafficking': 183,
             'Ransomware': 56,
             'Report Unlawful Content': 1})

In [None]:
# Inspect the number of training rows per sub-category.
subcategory_count

defaultdict(int,
            {'Cyber Bullying  Stalking  Sexting': 4089,
             'Fraud CallVishing': 5803,
             'Online Gambling  Betting': 444,
             'Online Job Fraud': 912,
             'UPI Related Frauds': 26856,
             'Internet Banking Related Fraud': 8872,
             nan: 6591,
             'Other': 10878,
             'Profile Hacking Identity Theft': 2073,
             'DebitCredit Card FraudSim Swap Fraud': 10805,
             'EWallet Related Fraud': 4047,
             'Data Breach/Theft': 484,
             'Cheating by Impersonation': 1988,
             'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 504,
             'FakeImpersonating Profile': 2299,
             'Cryptocurrency Fraud': 480,
             'Malware Attack': 521,
             'Business Email CompromiseEmail Takeover': 290,
             'Email Hacking': 349,
             'Hacking/Defacement': 540,
             'Unauthorised AccessData Breach': 1114,
      

In [None]:
# Inspect which sub-categories appear under each category.
category_to_subcategory_map

defaultdict(set,
            {'Online and Social Media Related Crime': {'Cheating by Impersonation',
              'Cyber Bullying  Stalking  Sexting',
              'EMail Phishing',
              'FakeImpersonating Profile',
              'Impersonating Email',
              'Intimidating Email',
              'Online Job Fraud',
              'Online Matrimonial Fraud',
              'Profile Hacking Identity Theft',
              'Provocative Speech for unlawful acts'},
             'Online Financial Fraud': {'Business Email CompromiseEmail Takeover',
              'DebitCredit Card FraudSim Swap Fraud',
              'DematDepository Fraud',
              'EWallet Related Fraud',
              'Fraud CallVishing',
              'Internet Banking Related Fraud',
              'UPI Related Frauds'},
             'Online Gambling  Betting': {'Online Gambling  Betting'},
             'RapeGang Rape RGRSexually Abusive Content': {nan},
             'Any Other Cyber Crime': {'Other'},
 

In [None]:
# Classes for which synthetic messages are generated.
# Each entry is [category, sub_category]; a single-element entry is a category
# that is handled without a sub-category by the cells below.
category_subcategory_list = [
    ['Report Unlawful Content', 'Against Interest of sovereignty or integrity of India'],
    ['Ransomware', 'Ransomware'],
    ['Online and Social Media Related Crime', 'Intimidating Email'],
    ['Online and Social Media Related Crime', 'EMail Phishing'],
    ['Online and Social Media Related Crime', 'Impersonating Email'],
    ['Hacking  Damage to computercomputer system etc', 'Damage to computer computer systems etc'],
    ['Hacking  Damage to computercomputer system etc', 'Website DefacementHacking'],
    ['Online and Social Media Related Crime', 'Online Matrimonial Fraud'],
    ['Online Cyber Trafficking', 'Online Trafficking'],
    ['Cyber Terrorism', 'Cyber Terrorism'],
    ['Online and Social Media Related Crime', 'Provocative Speech for unlawful acts'],
    ['Hacking  Damage to computercomputer system etc', 'Email Hacking'],
    ['Online Financial Fraud', 'Business Email CompromiseEmail Takeover'],
    ['Online Gambling  Betting', 'Online Gambling  Betting'],
    ['Child Pornography CPChild Sexual Abuse Material CSAM'],
]

## Generate Messages for Underrepresented Classes

In [None]:
# Maximum number of real complaints handed to the LLM as examples for one class.
MAX_SAMPLE_MESSAGES = 10
# Fixed seed so the same example messages are drawn on every run.
SAMPLE_SEED = 42


def _sample_messages(frame, category, subcategory=None,
                     max_messages=MAX_SAMPLE_MESSAGES, seed=SAMPLE_SEED):
  """Return up to `max_messages` randomly chosen complaint texts for one class.

  Args:
    frame: DataFrame with `category`, `sub_category` and `crimeaditionalinfo` columns.
    category: Value to match in the `category` column.
    subcategory: Value to match in the `sub_category` column. When None, rows
      are selected by category only.
    max_messages: Upper bound on the number of texts returned.
    seed: `random_state` used for the shuffle.

  Returns:
    list: Shuffled `crimeaditionalinfo` values, truncated to `max_messages`
    (shorter when the class has fewer rows, empty when nothing matches).
  """
  mask = frame["category"] == category
  if subcategory is not None:
    mask = mask & (frame["sub_category"] == subcategory)

  shuffled = frame.loc[mask, "crimeaditionalinfo"].sample(frac=1, random_state=seed)
  return list(shuffled.head(max_messages))


# One list of example messages per entry of category_subcategory_list, in the same order.
sample_messages_for_category = []

for entry in category_subcategory_list:
  if len(entry) == 1:
    category, subcategory = entry[0], None
  elif len(entry) == 2:
    category, subcategory = entry
  else:
    # Fail loudly instead of silently re-appending the previous class's samples.
    raise ValueError(f"Expected [category] or [category, sub_category], got {entry!r}")

  sample_messages_for_category.append(_sample_messages(df, category, subcategory))

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

In [None]:
# Template for the first generation pass; its three placeholders are filled per class.
prompt = PromptTemplate(
            template=GENERATE_EXAMPLES_PROMPT,
            input_variables=["category", "subcategory", "sample_messages"])

# NOTE(review): no API key is set anywhere in this notebook — presumably it is
# picked up from the environment; confirm before running.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
# Pipe the rendered prompt into the chat model.
chain = prompt | llm

In [None]:
# Raw LLM responses of the first pass (GENERATE_EXAMPLES_PROMPT), one string per
# entry of category_subcategory_list, in the same order.
synthetic_generated_messages = []

for i, entry in enumerate(category_subcategory_list):
  print(i)  # progress indicator: one LLM call per class

  if len(entry) == 1:
    # No sub-category for this class: the category name is sent in both prompt slots.
    category = subcategory = entry[0]
  elif len(entry) == 2:
    category, subcategory = entry
  else:
    # Fail loudly instead of silently reusing the previous class's labels.
    raise ValueError(f"Expected [category] or [category, sub_category], got {entry!r}")

  params = {
      "category": category,
      "subcategory": subcategory,
      "sample_messages": sample_messages_for_category[i],
  }
  response = chain.invoke(params)

  synthetic_generated_messages.append(str(response.content))

In [None]:
# Template for the second generation pass, built from the stricter
# GENERATE_EXAMPLES_PROMPT_2 (25 English / 25 Hinglish, distinct messages).
prompt_2 = PromptTemplate(
            template=GENERATE_EXAMPLES_PROMPT_2,
            input_variables=["category", "subcategory", "sample_messages"])

# Same model and temperature as the first pass; `llm` is simply re-created here.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
chain2 = prompt_2 | llm

In [None]:
# Raw LLM responses of the second pass (GENERATE_EXAMPLES_PROMPT_2), one string
# per entry of category_subcategory_list, in the same order.
synthetic_generated_messages_2 = []

for i, entry in enumerate(category_subcategory_list):
  print(i)  # progress indicator: one LLM call per class

  if len(entry) == 1:
    # No sub-category for this class: the category name is sent in both prompt slots.
    category = subcategory = entry[0]
  elif len(entry) == 2:
    category, subcategory = entry
  else:
    # Fail loudly instead of silently reusing the previous class's labels.
    raise ValueError(f"Expected [category] or [category, sub_category], got {entry!r}")

  params = {
      "category": category,
      "subcategory": subcategory,
      "sample_messages": sample_messages_for_category[i],
  }
  response = chain2.invoke(params)

  synthetic_generated_messages_2.append(str(response.content))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [None]:
# Spot-check which class sits at index 11.
category_subcategory_list[11]

['Hacking  Damage to computercomputer system etc', 'Email Hacking']

## Parse the AI-Generated Text to Extract Messages

In [None]:
# One "### Message <n>" marker: the number is optional and may be followed by a
# separator (":", ".", ")" or "-"); surrounding whitespace is consumed as well.
_MESSAGE_MARKER = re.compile(r"###\s*Message\s*(?:\d+\s*[:.)\-]?)?\s*")


def parse_messages(input_string):
    """
    Parse messages from a formatted LLM response into a list.

    Everything before the first "### Message" marker (the "*** Messages"
    header, leading whitespace, any introductory sentence) is discarded, so it
    can never be mistaken for a message. Whitespace inside each message,
    including line breaks, is collapsed to single spaces, and markers that are
    not followed by any text are skipped.

    Args:
        input_string (str): String containing messages in the format:
            "*** Messages ### Message 1 [content] ### Message 2 [content] ..."

    Returns:
        list: List of message strings, with each element containing the content
        of one message. Empty when the input is empty or contains no marker.
    """
    if not input_string:
        return []

    chunks = _MESSAGE_MARKER.split(input_string)

    # chunks[0] is the text in front of the first marker, never a message.
    messages = [" ".join(chunk.split()) for chunk in chunks[1:]]

    # Drop markers that carried no content.
    return [message for message in messages if message]

In [None]:
# Sanity check: number of messages parsed from the first second-pass response.
len(parse_messages(synthetic_generated_messages_2[0]))

50

In [None]:
# Column-wise accumulator for the synthetic dataset; keys match the label and
# text columns of the training CSV.
all_generated_messages = {"category": [], "sub_category": [], "crimeaditionalinfo": []}

for i, entry in enumerate(category_subcategory_list):
  if len(entry) == 1:
    # Category without a sub-category: leave the sub-category empty.
    category, subcategory = entry[0], None
  elif len(entry) == 2:
    category, subcategory = entry
  else:
    # Fail loudly instead of silently labelling rows with the previous class.
    raise ValueError(f"Expected [category] or [category, sub_category], got {entry!r}")

  # Combine the parsed messages of both generation passes for this class.
  total_generated_msg = (parse_messages(synthetic_generated_messages[i])
                         + parse_messages(synthetic_generated_messages_2[i]))
  num_generated_msg = len(total_generated_msg)

  # Repeat the labels once per message so all three columns stay aligned.
  all_generated_messages["category"].extend([category] * num_generated_msg)
  all_generated_messages["sub_category"].extend([subcategory] * num_generated_msg)
  all_generated_messages["crimeaditionalinfo"].extend(total_generated_msg)

## Create Synthetic Dataset (CSV File)

In [None]:
# Turn the collected columns into a DataFrame (one row per generated message).
synthetic_train_df = pd.DataFrame(all_generated_messages)

In [None]:
# Shuffle the rows so classes are interleaved.
# NOTE(review): no random_state is passed, so the order differs on every run.
synthetic_train_df = synthetic_train_df.sample(frac=1)

In [None]:
# Total number of synthetic rows.
len(synthetic_train_df)

1500

In [None]:
# Write the synthetic dataset to the current working directory.
# NOTE(review): the (shuffled) index is written as well because index=False is not
# passed; it will appear as an unnamed first column when the file is read back.
synthetic_train_df.to_csv("synthetic_train.csv")