# Clean up datasets

In [7]:
from re import escape
import dask.dataframe as dd
import pandas as pd

# Path to your CSV file
csv_file_path = "~/Downloads/data/companies_sorted.csv"

# Number of random rows to select
n_rows = 1000

# Seed for the random sampling; set to None to draw a different sample on every run
random_seed = 42

# Output text file path
output_file_path = "data/companies.txt"

# Step 1: Load CSV as a Dask DataFrame
df = dd.read_csv(csv_file_path, assume_missing=True)

# Step 2: Draw a random sample of about n_rows rows
# len(df) has to count the rows of the whole CSV, so this line can be slow on large files.
total_rows = len(df)
if total_rows == 0:
    raise ValueError(f"No rows to sample from in {csv_file_path}")
# Sampling without replacement cannot take more than 100% of the rows, so cap the fraction.
sample_frac = min(1.0, n_rows / total_rows)
sampled_df = df.sample(frac=sample_frac, replace=False, random_state=random_seed).compute()
# The fraction is applied per partition, so the total may overshoot n_rows slightly; trim it.
if len(sampled_df) > n_rows:
    sampled_df = sampled_df.sample(n=n_rows, random_state=random_seed)

# Step 3: Extract the second column (adjust index if needed)
companies = sampled_df.iloc[:, 1]  # Adjust index if needed based on your CSV structure

# Step 4: Remove every double-quote character from the values (literal match, not a regex)
companies = companies.str.replace('"', '', regex=False)

# Missing or empty names would only produce blank lines in the output, so drop them.
companies = companies.dropna()
companies = companies[companies != ""]

# Step 5: Write to a plain text file, one value per line.
# Written directly instead of via to_csv so that characters such as "|" and "\" inside
# a name are stored verbatim rather than backslash-escaped by the CSV writer.
with open(output_file_path, "w", encoding="utf-8") as f:
    f.writelines(f"{name}\n" for name in companies)


In [2]:
# Select random domains from a CSV file

import dask.dataframe as dd
import pandas as pd

# Path to your CSV file
csv_file_path = "~/Downloads/data/domains.csv"

# Number of random rows to select
n_rows = 1000

# Seed for the random sampling; set to None to draw a different sample on every run
random_seed = 42

# Output text file path
output_file_path = "data/domains.txt"

# Step 1: Load CSV as a Dask DataFrame (this file is semicolon-separated)
df = dd.read_csv(csv_file_path, assume_missing=True, sep=';')

# Step 2: Draw a random sample of about n_rows rows
# len(df) has to count the rows of the whole CSV, so this line can be slow on large files.
total_rows = len(df)
if total_rows == 0:
    raise ValueError(f"No rows to sample from in {csv_file_path}")
# Sampling without replacement cannot take more than 100% of the rows, so cap the fraction.
sample_frac = min(1.0, n_rows / total_rows)
sampled_df = df.sample(frac=sample_frac, replace=False, random_state=random_seed).compute()
# The fraction is applied per partition, so the total may overshoot n_rows slightly; trim it.
if len(sampled_df) > n_rows:
    sampled_df = sampled_df.sample(n=n_rows, random_state=random_seed)

# Step 3: Extract the first column (adjust index if needed)
domains = sampled_df.iloc[:, 0]  # Adjust index if needed based on your CSV structure

# Step 4: Remove every double-quote character from the values (literal match, not a regex)
domains = domains.str.replace('"', '', regex=False)

# Missing or empty values would only produce blank lines in the output, so drop them.
domains = domains.dropna()
domains = domains[domains != ""]

# Step 5: Write to a plain text file, one value per line.
# Written directly instead of via to_csv so that no CSV escaping is applied to the values.
with open(output_file_path, "w", encoding="utf-8") as f:
    f.writelines(f"{domain}\n" for domain in domains)


# Generate Datasets

In [1]:
# Generate IP addresses

from ner.ipgen import generate_ipv4_addresses, generate_ipv6_addresses

# Write 1000 generated addresses per IP version, one address per line.
# Each entry pairs an output file with the generator that fills it.
for ip_path, make_addresses in (
    ("data/ipv4_addresses.txt", generate_ipv4_addresses),
    ("data/ipv6_addresses.txt", generate_ipv6_addresses),
):
    with open(ip_path, "w") as f:
        f.writelines(ip + "\n" for ip in make_addresses(1000))

In [1]:
# Generate URLs

from ner.urlgen import generate_urls

# Write the 1000 generated URLs to data/urls.txt, each followed by a newline.
with open("data/urls.txt", "w") as urls_file:
    urls_file.writelines(url + "\n" for url in generate_urls(1000))

In [3]:
from faker import Faker

# Create a Faker instance and seed only this instance, so the generated emails are
# the same on every run without touching the random state of any other Faker user.
faker = Faker()
faker.seed_instance(42)

# Generate 1000 unique email addresses using Faker.
# The set gives fast duplicate detection; the list remembers generation order so the
# output file does not depend on set iteration order (which varies between runs).
n_emails = 1000
unique_emails = set()
ordered_emails = []
while len(unique_emails) < n_emails:
    candidate = faker.email(safe=False)
    if candidate not in unique_emails:
        unique_emails.add(candidate)
        ordered_emails.append(candidate)

# Write to file, one address per line
with open("data/emails.txt", "w", encoding="utf-8") as f:
    f.writelines(email + "\n" for email in ordered_emails)


In [2]:
from ner.datagen import generate_ner_dataset

# Path handed to generate_ner_dataset — presumably the JSON file it writes;
# confirm against ner.datagen.
ner_dataset_path = "data/ner_dataset.json"
generate_ner_dataset(ner_dataset_path)
