# Install dependencies

In [1]:
# Install the correct versions before anything else
!pip install -q -U bitsandbytes transformers accelerate
!pip install -q sentence-transformers
!pip install transformers torch
!pip install stix2


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Imports

In [2]:
# Standard library imports
import os
import json
import sqlite3
from collections import defaultdict
from datetime import datetime

# Data processing and numerical computing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display, Markdown

# Machine Learning and NLP
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Deep Learning and Transformers
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)
from huggingface_hub import login

# Utility and Progress
from tqdm import tqdm
import gc

# STIX and Threat Intelligence
from stix2 import (
    Bundle,
    ThreatActor,
    AttackPattern,
    Malware,
    Relationship,
    Indicator,
    KillChainPhase,
    CustomObject,
    ExternalReference
)
from stix2.properties import (
    ListProperty,
    StringProperty,
    IntegerProperty
)

# Kaggle-specific
from kaggle_secrets import UserSecretsClient

2025-04-27 21:18:25.163703: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745788705.380146      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745788705.441953      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# setting device

In [3]:
device='cuda' if torch.cuda.is_available() else 'cpu'

# Load and prepare data

In [4]:
# Connect to your SQLite database
conn = sqlite3.connect('/kaggle/input/ransomwarelivevictims/_reports.db') 
# Load data into DataFrame
query = "SELECT * FROM reports"  
df = pd.read_sql_query(query, conn)
conn.close()
# see values
df.head()

Unnamed: 0,id,source,title,content,url,scrape_date
0,1,Ransomware.live API,Vicarage Court Solicitors - lynx,At Vicarage Court Solicitors we aim to provide...,https://www.ransomware.live/id/VmljYXJhZ2UgQ29...,2025-04-24 21:52:39
1,2,Ransomware.live API,Fleet Canada - silent,Country: Canada | Revenue: 32.00M USD | Employ...,https://www.ransomware.live/id/RmxlZXQgQ2FuYWR...,2025-04-24 21:52:39
2,3,Ransomware.live API,arkansasprimarycare.com - incransom,Arkansas Primary Care Clinic PA is a company t...,https://www.ransomware.live/id/YXJrYW5zYXNwcml...,2025-04-24 21:52:39
3,4,Ransomware.live API,China Harbour Engeneiring Company - devman,450k USD,https://www.ransomware.live/id/Q2hpbmEgSGFyYm9...,2025-04-24 21:52:39
4,5,Ransomware.live API,thederbyhighschool.co.uk - kairos,UK - The Derby High School,https://www.ransomware.live/id/dGhlZGVyYnloaWd...,2025-04-24 21:52:39


## Data explration

####  Victim Industry Distribution

In [5]:
# Extract industries from content (simplified example)
df['industry'] = df['content'].str.extract(r'Industry:\s*([^\n|]+)')

# Clean and standardize
industry_map = {
    'Hospitals & Physicians Clinics': 'Healthcare',
    'Solicitors': 'Legal',
    'Primary Care': 'Healthcare',
    'Pulmonary': 'Healthcare',
    'High School': 'Education'
}
df['industry'] = df['industry'].replace(industry_map).fillna('Other')

fig = px.pie(df, names='industry', 
             title='<b>Victim Industry Distribution</b>',
             hole=0.3)
fig.update_traces(textposition='inside', 
                 textinfo='percent+label')
fig.show()

####  Geographical Heatmap

In [6]:
import re  # Make sure to import re for regex operations

# Improved country extraction with proper regex patterns
df['country'] = df['content'].str.extract(
    r'(?:Country|Location):?\s*([^\n|]+)', 
    flags=re.IGNORECASE
)[0].fillna(
    df['title'].str.extract(r'(USA|UK|United Kingdom|Canada|China)', flags=re.IGNORECASE)[0]
).str.strip()

# Standardize country names
country_mapping = {
    'USA': 'United States',
    'UK': 'United Kingdom',
    'US': 'United States'
}
df['country'] = df['country'].replace(country_mapping)

# Generate the choropleth plot
if df['country'].notna().any():
    country_counts = df['country'].value_counts().reset_index()
    country_counts.columns = ['country', 'count']
    
    fig = px.choropleth(
        country_counts,
        locations='country',
        locationmode='country names',
        color='count',
        title='<b>Geographical Distribution of Attacks</b>',
        color_continuous_scale='Viridis'
    )
    fig.show()
else:
    print("No country data available for mapping")

#### Revenue vs Employee Size Bubble Chart

In [7]:
# Extract numerical values
df['revenue_usd'] = df['content'].str.extract(r'Revenue:\s*[\$]?(\d+\.?\d*)[kKmMbB]?')
df['employees'] = df['content'].str.extract(r'Employees:\s*(\d+)')

# Convert to numeric
df['revenue_usd'] = pd.to_numeric(df['revenue_usd'], errors='coerce')
df['employees'] = pd.to_numeric(df['employees'], errors='coerce')

fig = px.scatter(df.dropna(),
                x='employees',
                y='revenue_usd',
                size='revenue_usd',
                color='industry',
                hover_name='title',
                log_x=True,
                title='<b>Victim Company Size Analysis</b>')
fig.update_layout(xaxis_title="Number of Employees",
                yaxis_title="Revenue (USD)")
fig.show()

# Feature Extraction Using LLM

In [8]:
# set up model
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("mistral-7b")
login(token=hf_token)

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)

#quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)


# pilpeline
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)
llm.tokenizer.pad_token_id = llm.model.config.eos_token_id
print("Model ready!")


The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0


Model ready!


In [9]:
def build_prompt(content, title=None, tags=None):
    content = str(content)  
    title = str(title) if title else ""
    tags = str(tags) if tags else ""
    return f"""<s>[INST] Analyze the provided content, title, and tags to extract SPECIFIC VALUES for each field:
{{
    "Country": "Extract the country (e.g., 'USA', 'Canada', 'UK') or 'XX' if unknown. Map regions (e.g., 'Arkansas', 'Jamesburg, New Jersey') to countries (e.g., 'USA').",
    "Industry": "Identify a standardized industry (e.g., 'Healthcare', 'Legal', 'Manufacturing') or 'XX' if unknown. Map specific terms (e.g., 'Solicitors' -> 'Legal', 'Casino' -> 'Gambling').",
    "Revenue": "Extract revenue as an integer in USD (e.g., 1000000) or 0 if unknown. Handle ranges (e.g., '5Mto10M' -> 7500000) and formats like 'Xk', 'Xm', 'Xb'.",
    "Employees": "Extract number of employees as an integer (e.g., 500) or 0 if unknown. Handle ranges (e.g., '20to49' -> 30).",
    "Data_types": ["List specific data types (e.g., 'customer data', 'blueprints', 'financial records') or empty list [] if none."]
}}

Search for these patterns:
- Countries: Explicit mentions like 'USA', 'Canada', 'UK', 'Argentina', or infer from context (e.g., 'Little Rock, Arkansas' -> 'USA', 'East Anglia' -> 'UK'). Use a country list: ['USA', 'Canada', 'UK', 'Argentina', 'Brazil', 'Germany', 'Italy', ...].
- Revenue: Formats like '$X', 'X million', 'X billion', 'Xk', 'Xm', 'Xb', or ranges like '5Mto10M'. Convert to integers (e.g., '32.00M USD' -> 32000000, '450k' -> 450000).
- Employees: Numbers or ranges like '20to49' (take midpoint, e.g., 30).
- Industries: Map keywords to standardized categories:
  - 'Solicitors', 'Law Firm', 'Legal' -> 'Legal'
  - 'Hospitals', 'Physicians', 'Medical', 'Pulmonary' -> 'Healthcare'
  - 'Casino' -> 'Gambling'
  - 'Beverage', 'Cacao', 'Manufacturing' -> 'Manufacturing'
  - 'Construction and services', 'Road marking' -> 'Construction'
  - 'Retail', 'Grocery Retail' -> 'Retail'
  - 'Technology', 'Information' -> 'Technology'
  - Others -> 'XX' if unclear
- Data types: Identify terms like 'customer data', 'emails', 'payment records', 'blueprints', 'private data', 'corporate NDA's', 'financial data', '370gb', or empty list [] if none.

Prioritize structured data:
- Parse fields after delimiters like '===>', '|', or labels like 'Country:', 'Revenue:', etc.
- Use title and tags for context (e.g., 'Casino Resort' in title -> 'Gambling' industry).

Example Output:
{{
    "Country": "Canada",
    "Industry": "Retail",
    "Revenue": 32000000,
    "Employees": 109,
    "Data_types": ["customer emails", "payment records"]
}}

Input:
- Content: {content}
- Title: {title}
- Tags: {tags}

Return ONLY the JSON with EXTRACTED VALUES. No explanations. [/INST]"""

In [10]:
# json file creation fn
def extract_json_from_response(raw_text):
    """Robust JSON extraction with multiple fallback methods"""
    try:
        # First try: Find first { and last } and parse what's between them
        start_idx = raw_text.find('{')
        end_idx = raw_text.rfind('}')
        if start_idx == -1 or end_idx == -1:
            raise ValueError("No JSON brackets found")
        
        json_str = raw_text[start_idx:end_idx+1]
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Second try: Look for JSON after the last [/INST] marker
        inst_marker = raw_text.rfind('[/INST]')
        if inst_marker != -1:
            json_part = raw_text[inst_marker+7:]  # 7 is len of '[/INST]'
            try:
                return json.loads(json_part.strip())
            except json.JSONDecodeError:
                pass
        # Third try: Find the first valid JSON object by brute force
        for i in range(len(raw_text)):
            try:
                return json.loads(raw_text[i:])
            except json.JSONDecodeError:
                continue
        raise ValueError("No valid JSON found in response")

In [11]:
# extract data for all groups
groups = df['source'].unique()
print(f"the groups are : {groups}")
batch_size = 2
# Batch Processing
group_profiles = {}

for group in tqdm(groups):
    samples = df[df['source'] == group]['content'].tolist()  
    prompts = [build_prompt(sample) for sample in samples]
    
    try:
        responses = llm(prompts, max_new_tokens=320, do_sample=False, batch_size=batch_size)
        parsed = []
        for batch in responses:
            for r in batch:
                raw_text = r['generated_text']
                try:
                    parsed_json = extract_json_from_response(raw_text)
                    # Validate structure
                    if not all(k in parsed_json for k in ["Country", "Industry", "Revenue", "Employees", "Data_types"]):
                        raise ValueError("Missing required keys")
                    parsed.append(parsed_json)
                except Exception as e:
                    print(f"Parse error for {group[:20]}...: {str(e)[:100]}")
                    print(f"Problematic response (truncated):\n{raw_text[:200]}...")
                    parsed.append({
                        "Country": "ERROR",
                        "Industry": "ERROR",
                        "Revenue": -1,
                        "Employees": -1,
                        "Data_types": []
                    })
        group_profiles[group] = parsed
    except Exception as e:
        print(f"Processing error for {group}: {e}")
        group_profiles[group] = []
    
    torch.cuda.empty_cache()
    gc.collect()

# Save results
with open('group_profiles.json', 'w') as f:
    json.dump(group_profiles, f, indent=2)

the groups are : ['Ransomware.live API']


  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pa

# create STIX data

In [12]:

# Define a custom ThreatActor with allowed extensions
@CustomObject('x-threat-actor', [
    ('x_most_targeted_country', StringProperty(required=True)),
    ('x_most_targeted_industry', StringProperty(required=True)),
    ('x_avg_victim_revenue', IntegerProperty(required=True)),
    ('x_avg_victim_size', IntegerProperty(required=True))
])
class ExtendedThreatActor:
    pass

def load_group_profiles(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

def create_stix_objects(group_profiles):
    stix_objects = []
    group_stats = defaultdict(lambda: {
        'countries': defaultdict(int),
        'industries': defaultdict(int),
        'revenues': [],
        'employees': [],
        'data_types': defaultdict(int)
    })

    # Calculate statistics for each group
    for group, profiles in group_profiles.items():
        for profile in profiles:
            if profile.get('Country') != 'ERROR':
                stats = group_stats[group]
                stats['countries'][profile['Country']] += 1
                stats['industries'][profile['Industry']] += 1
                stats['revenues'].append(profile['Revenue'])
                stats['employees'].append(profile['Employees'])
                for data_type in profile['Data_types']:
                    stats['data_types'][data_type] += 1

    # Create STIX objects for each group
    for group, stats in group_stats.items():
        if not stats['revenues'] or not stats['employees']:
            continue

        # Create Threat Actor with custom properties
        threat_actor = ExtendedThreatActor(
            name=group,
            description=f"Ransomware group targeting {max(stats['industries'].items(), key=lambda x: x[1])[0] if stats['industries'] else 'various'} sectors",
            sophistication="intermediate",
            resource_level="organization",
            primary_motivation="financial",
            goals=["financial-gain", "disrupt-operations"],
            allow_custom=True,
            x_most_targeted_country=max(stats['countries'].items(), key=lambda x: x[1])[0] if stats['countries'] else "Unknown",
            x_most_targeted_industry=max(stats['industries'].items(), key=lambda x: x[1])[0] if stats['industries'] else "Unknown",
            x_avg_victim_revenue=int(sum(stats['revenues'])/len(stats['revenues'])) if stats['revenues'] else 0,
            x_avg_victim_size=int(sum(stats['employees'])/len(stats['employees'])) if stats['employees'] else 0,
            external_references=[
                ExternalReference(
                    source_name="Ransomware.live",
                    url=f"https://ransomware.live/#/group/{group.lower()}"
                )
            ]
        )
        
        # Create Malware
        malware = Malware(
            name=f"{group} Ransomware",
            malware_types=["ransomware"],
            is_family=False,
            description=f"Ransomware variant used by {group}",
            allow_custom=True
        )
        
        # Create Attack Pattern
        attack_pattern = AttackPattern(
            name="Data Encryption and Extortion",
            description="Encrypts victim data and demands ransom payment",
            kill_chain_phases=[
                KillChainPhase(
                    kill_chain_name="lockheed-martin-cyber-kill-chain",
                    phase_name="actions-on-objectives"
                )
            ],
            allow_custom=True
        )
        
        # Create Indicator if data exists
        if stats['data_types']:
            top_data_type = max(stats['data_types'].items(), key=lambda x: x[1])[0]
            indicator = Indicator(
                name=f"{group} Data Exfiltration Pattern",
                description=f"Typically exfiltrates {top_data_type} data",
                pattern=f"[file:name MATCHES '{top_data_type.lower().replace(' ', '_')}']",
                pattern_type="stix",
                valid_from=datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
                allow_custom=True
            )
            stix_objects.append(indicator)
        
        # Create relationships
        stix_objects.extend([
            threat_actor,
            malware,
            attack_pattern,
            Relationship(
                relationship_type='uses',
                source_ref=threat_actor.id,
                target_ref=malware.id,
                allow_custom=True
            ),
            Relationship(
                relationship_type='employs',
                source_ref=threat_actor.id,
                target_ref=attack_pattern.id,
                allow_custom=True
            )
        ])
    
    return Bundle(objects=stix_objects, allow_custom=True)

def save_stix_bundle(bundle, output_path):
    with open(output_path, 'w') as f:
        f.write(bundle.serialize(pretty=True))


def main():
    # Load processed profiles
    group_profiles = load_group_profiles('/kaggle/working/group_profiles.json')
    
    # Generate STIX bundle
    stix_bundle = create_stix_objects(group_profiles)
    
    # Save to file
    save_stix_bundle(stix_bundle, '/kaggle/working/ransomware_groups.stix')
    print(f"Generated STIX bundle with {len(stix_bundle.objects)} objects")

if __name__ == "__main__":
    main()

Generated STIX bundle with 6 objects


In [14]:
print("notebook completed!")

notebook completed!
