# In [2]:
from stix2 import Indicator, ObservedData, NetworkTraffic, IPv4Address, Bundle
from datetime import datetime
import pandas as pd
import logging

# Configure logging before any work is done. The root logger defaults to the
# WARNING level, so without this the INFO-level completion message logged at
# the end of the script would be silently dropped. basicConfig does nothing
# if the hosting environment has already installed handlers on the root logger.
logging.basicConfig(level=logging.INFO)

# Path to the CSV file containing the dataset.
csv_file_path = 'TRG_dataset01_test_ml.csv'

# Number of rows per batch; the CSV is read in batches to limit memory usage.
chunk_size = 10000

# Because chunksize is given, read_csv returns an iterator that yields one
# DataFrame per batch instead of loading the whole file into a single frame.
chunks = pd.read_csv(csv_file_path, chunksize=chunk_size)

# function to parse timestamps from various formats
def parse_timestamp(timestamp):
    """Parse a timestamp value into a ``datetime``, trying several layouts.

    String input is matched against these formats, most specific first:
    ``%Y-%m-%d %H:%M:%S.%f``, ``%Y-%m-%d %H:%M:%S`` and ``%Y-%m-%d``.
    Leading and trailing whitespace is ignored.

    Args:
        timestamp: The raw value. Normally a string. A ``datetime`` instance
            is returned unchanged. Any other non-string value (for example
            ``None`` or a float NaN) is treated as unparseable instead of
            raising ``TypeError`` from ``strptime``.

    Returns:
        The parsed ``datetime`` (without timezone info for string input), or
        ``None`` if the value could not be parsed; an error is logged in
        that case.
    """
    # Already a datetime: nothing to parse.
    if isinstance(timestamp, datetime):
        return timestamp

    # Only strings can be handed to strptime; anything else falls through to
    # the failure path below.
    if isinstance(timestamp, str):
        text = timestamp.strip()
        formats = (
            '%Y-%m-%d %H:%M:%S.%f',  # date and time with microseconds
            '%Y-%m-%d %H:%M:%S',     # date and time without microseconds
            '%Y-%m-%d',              # date only
        )
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                # Layout did not match; try the next, less specific one.
                continue

    # Log an error if every attempt failed.
    logging.error(f"Failed to parse timestamp entirely: {timestamp}")
    return None

# Function to create a STIX Indicator object for each row
def create_indicator(row):
    """Build a STIX Indicator that flags the row's source IP address.

    Args:
        row: Mapping-like record (for example a pandas Series) providing the
            ``threat_type``, ``source_ip`` and ``timestamp`` fields.

    Returns:
        An ``Indicator`` named after the threat type, whose pattern matches
        the source IP and whose ``valid_from`` is the row's parsed timestamp.
        Returns ``None`` when the timestamp cannot be parsed, mirroring
        ``create_observed_data``.
    """
    valid_from = parse_timestamp(row['timestamp'])
    if valid_from is None:
        # Skip the row rather than passing valid_from=None to Indicator, so
        # that no indicator is emitted with a validity start that does not
        # come from the data. parse_timestamp has already logged the failure.
        return None

    # Escape backslashes and single quotes so an unexpected character in the
    # CSV value cannot terminate the pattern's string literal early.
    ip_literal = str(row['source_ip']).replace('\\', '\\\\').replace("'", "\\'")

    return Indicator(
        name=f"Indicator for {row['threat_type']}",  # name based on the threat type
        pattern=f"[ipv4-addr:value = '{ip_literal}']",  # pattern matching the source IP
        pattern_type="stix",  # STIX pattern type
        valid_from=valid_from,  # start of validity taken from the row's timestamp
    )

# Function to create a STIX ObservedData object for each row
def _to_port(value):
    """Convert *value* to a port number, or return None if it is not usable."""
    try:
        port = int(value)
    except (ValueError, TypeError, OverflowError):
        # Covers non-numeric text, None, NaN and infinite floats.
        return None
    # STIX 2.1 defines ports as integers in the range 0-65535. An out-of-range
    # value is dropped like an unparseable one.
    # NOTE(review): presumably NetworkTraffic would otherwise reject the value
    # and abort the whole row - confirm against stix2's property validation.
    return port if 0 <= port <= 65535 else None


def create_observed_data(row):
    """Build a STIX ObservedData object describing the row's network traffic.

    Args:
        row: Mapping-like record (for example a pandas Series) providing the
            ``timestamp``, ``source_ip``, ``destination_ip``, ``source_port``,
            ``destination_port`` and ``protocol`` fields.

    Returns:
        An ``ObservedData`` object containing the source and destination
        ``IPv4Address`` objects and a ``NetworkTraffic`` object that refers
        to them, or ``None`` when the row's timestamp cannot be parsed.
    """
    # Each row carries a single timestamp, so it serves as both the first and
    # the last observation time. Parsing it once also avoids logging the same
    # parse failure twice.
    observed_at = parse_timestamp(row['timestamp'])
    if observed_at is None:
        logging.error(f"Invalid timestamp for row: {row}")
        return None

    # STIX IPv4Address objects for the two endpoints.
    src_ip = IPv4Address(value=row['source_ip'])
    dst_ip = IPv4Address(value=row['destination_ip'])

    # NetworkTraffic refers to the endpoints by their ids; ports that cannot
    # be converted are passed as None.
    network_traffic = NetworkTraffic(
        protocols=[str(row['protocol']).lower()],  # protocol used
        src_ref=src_ip.id,  # reference to source IP address
        dst_ref=dst_ip.id,  # reference to destination IP address
        src_port=_to_port(row['source_port']),
        dst_port=_to_port(row['destination_port']),
    )

    # The objects mapping is keyed by the same ids used in src_ref / dst_ref.
    # NOTE(review): STIX 2.1 deprecates `objects` in favour of `object_refs`;
    # switching would also require adding the SCOs to the bundle - confirm.
    return ObservedData(
        first_observed=observed_at,  # first time the activity was observed
        last_observed=observed_at,   # last time the activity was observed
        number_observed=1,  # a single record represents one observation
        objects={src_ip.id: src_ip, dst_ip.id: dst_ip, network_traffic.id: network_traffic},
    )

# Accumulators for the STIX Indicator and ObservedData objects produced
# across all chunks.
all_indicators, all_observed_data = [], []

# Walk the CSV chunk by chunk and convert every row into STIX objects.
for chunk in chunks:
    for index, row in chunk.iterrows():
        try:
            # Build both objects first: if either call raises, nothing from
            # this row is kept.
            indicator = create_indicator(row)
            observed_data = create_observed_data(row)
        except Exception as err:
            # Log the failure and move on to the next row.
            logging.error(f"Error processing row {index}: {err}")
            continue

        # Keep only the objects that were actually created.
        if indicator:
            all_indicators.append(indicator)
        if observed_data:
            all_observed_data.append(observed_data)

# Combine all the objects into a single STIX Bundle.
bundle = Bundle(objects=all_indicators + all_observed_data)

# Destination file, named once so the log message below cannot drift from the
# path that is actually written.
output_path = 'stix_bundle.json'

# Write the bundle as pretty-printed JSON. The encoding is stated explicitly
# so the output does not depend on the platform's default text encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(bundle.serialize(pretty=True))

# Log that the STIX bundle was created and exported.
logging.info("STIX bundle created and exported to %s", output_path)
