In [None]:
import pandas as pd
from transformers import pipeline
import os
import sys 
sys.path.append('/Users/henrychang/sys_security_ai')
from utility import get_logger, config_file_loc, load_config, set_working_directory, check_and_set_device

# Set up logging configuration
import logging
logger = get_logger()

class CybersecurityDetector:
    """
    Loads a CSV of log records, cleans it, removes benign rows and labels the
    remaining rows as 'Threat' or 'Normal' using a text-generation model.

    The methods below read these columns from the input data: 'log_text',
    'anomaly_score', 'classification' and 'anomaly_label'.
    """

    # Phrases that mark a log line as routine activity. They are joined into a
    # single alternation pattern and matched case-insensitively in simple_filter().
    NON_THREAT_PHRASES = (
        'login successful',
        'system update',
        'user logged in',
        'user logged out',
        'system rebooted',
        'file saved',
        'file opened',
        'file closed',
        'session ended',
        'session started',
        'heartbeat message',
        'backup completed',
        'scheduled task completed',
        'configuration updated',
    )

    def __init__(self, input_file, output_file):
        """
        Initializes the CybersecurityDetector with input and output file paths.

        Parameters:
            input_file (str): Path to the input CSV file.
            output_file (str): Path to save the output CSV file.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.df = pd.DataFrame()
        # Defined before the try block so the attribute always exists, even when
        # model initialization fails; detect_threats() checks for None.
        self.threat_detection_model = None
        try:
            # Initialize the threat detection model
            device = check_and_set_device()
            self.threat_detection_model = pipeline('text-generation', model='gpt2-large', max_length=50, truncation=True, device=device)
            # NOTE: a larger model could be substituted here if it is available
            # locally or supported by the transformers pipeline.
            logger.info("Threat detection model initialized")
        except Exception as e:
            logger.error(f"Error initializing threat detection model: {e}")

    def load_data(self):
        """
        Loads data from the specified CSV file into a DataFrame (self.df).

        Errors are logged, not raised; on failure self.df is left unchanged.
        """
        try:
            self.df = pd.read_csv(self.input_file)
            logger.info("Data loaded")
        except FileNotFoundError:
            logger.error(f"Error: File {self.input_file} not found.")
        except pd.errors.EmptyDataError:
            logger.error("Error: No data in the file.")
        except pd.errors.ParserError:
            logger.error("Error: Parsing error.")
        except Exception as e:
            logger.error(f"Error during data loading: {e}")

    def preprocess_data(self):
        """
        Fills missing values in numeric, categorical, datetime, and boolean columns.

        Replacement per column type: mean (numeric), most frequent value
        (text/object), earliest timestamp (datetime), False (boolean).
        Columns without missing values, or with no usable replacement
        (e.g. an entirely empty text column), are left untouched.
        """
        try:
            for col in self.df.columns:
                series = self.df[col]
                if not series.isna().any():
                    continue  # nothing to fill

                if series.dtype in ('float64', 'int64'):  # Numeric columns
                    fill_value = series.mean()
                elif series.dtype == 'object' or pd.api.types.is_string_dtype(series):  # Categorical columns
                    modes = series.mode()
                    if modes.empty:
                        # Every value is missing, so there is no mode to fall back on
                        continue
                    fill_value = modes.iloc[0]
                elif pd.api.types.is_datetime64_any_dtype(series):  # Datetime columns
                    fill_value = series.min()
                elif pd.api.types.is_bool_dtype(series):  # Boolean columns
                    fill_value = False
                else:
                    continue

                # fillna() returns a new Series; it must be assigned back to take effect
                self.df[col] = series.fillna(fill_value)
            logger.info("Data preprocessed")
        except Exception as e:
            logger.error(f"Error during preprocessing: {e}")

    def simple_filter(self):
        """
        Applies a simple rule-based filter that drops rows whose 'log_text'
        contains any of NON_THREAT_PHRASES (case-insensitive).

        Rows with a missing 'log_text' are kept, since they cannot be shown to be benign.
        """
        try:
            pattern = '|'.join(self.NON_THREAT_PHRASES)
            # na=False keeps the mask strictly boolean when log_text has missing values
            is_benign = self.df['log_text'].str.contains(pattern, case=False, na=False)
            # copy() so later column assignments do not write to a view of the old frame
            self.df = self.df[~is_benign].copy()
            logger.info("Simple filtering applied")
        except Exception as e:
            logger.error(f"Error during simple filtering: {e}")

    def filter_anomalies(self, threshold=0.5):
        """
        Filters data to include only rows whose 'anomaly_score' is below a threshold.

        Parameters:
            threshold (float): Rows with anomaly_score < threshold are kept. Defaults to 0.5.

        NOTE(review): this assumes a lower anomaly_score means "more anomalous";
        confirm against the step that produces the score.
        """
        try:
            self.df = self.df[self.df['anomaly_score'] < threshold].copy()
            logger.info("Filtered to likely anomalies only")
        except Exception as e:
            logger.error(f"Error during anomaly filtering: {e}")

    # Please note that all the features already created can be used to prepare a more
    # sophisticated prompt for more accurate threat prediction. The prompt in
    # detect_threats() is only a simple example.
    def detect_threats(self):
        """
        Detects threats using a generative AI model by analyzing log entries and their classifications.

        Adds a 'threat_detection' column to self.df holding 'Threat', 'Normal',
        or 'Error' (when the model is unavailable or generation failed for that row).
        """
        def detect_threat(row):
            try:
                # Create a prompt for threat detection
                prompt = f"""Log entry: {row['log_text']}
                Sentiment Classification: {row['classification']}
                Predictive Anomaly Label: {row['anomaly_label']}
                Is this potentially a threat?"""
                generated_text = self.threat_detection_model(
                    prompt, max_length=250, num_return_sequences=1, return_full_text=False
                )[0]['generated_text']
                # The prompt itself contains the word "threat", so only the model's
                # continuation may be inspected. Strip the prompt defensively in case
                # the model callable still echoes it.
                if generated_text.startswith(prompt):
                    generated_text = generated_text[len(prompt):]
                return 'Threat' if 'threat' in generated_text.lower() else 'Normal'
            except Exception as e:
                logger.error(f"Error during threat detection for row: {row.name} - {e}")
                return 'Error'

        try:
            if self.df.empty:
                # apply(axis=1) on an empty frame yields a DataFrame, which cannot be
                # assigned to a single column; add an empty column instead.
                self.df['threat_detection'] = pd.Series(dtype='object')
                logger.info("No rows to analyze for threats")
                return
            if self.threat_detection_model is None:
                logger.error("Threat detection model is not available; marking all rows as 'Error'")
                self.df['threat_detection'] = 'Error'
                return
            self.df['threat_detection'] = self.df.apply(detect_threat, axis=1)
            logger.info("Threats detected using generative AI")
        except Exception as e:
            logger.error(f"Error applying threat detection: {e}")

    def save_data(self):
        """
        Saves the DataFrame with threat detection results to the specified output CSV file.
        """
        try:
            self.df.to_csv(self.output_file, index=False)
            logger.info(f"Results saved to {self.output_file}")
        except Exception as e:
            logger.error(f"Error saving data: {e}")

    def run(self):
        """
        Orchestrates the entire workflow by calling the methods in sequence:
        load, preprocess, rule-based filter, anomaly filter, threat detection, save.

        Stops early, without writing the output file, when loading produced no columns at all.
        """
        try:
            self.load_data()
            if self.df.columns.empty:
                # load_data() logs its own failure; avoid overwriting the output
                # file with an empty frame.
                logger.error("No data was loaded; aborting run")
                return
            self.preprocess_data()
            self.simple_filter()
            self.filter_anomalies()
            self.detect_threats()
            self.save_data()
        except Exception as e:
            logger.error(f"Error in run process: {e}")

class CustomError(Exception):
    """
    Error type raised by the script entry point after a failure has been logged.

    The entry point chains the underlying exception as the cause (``raise ... from``),
    so the original traceback stays available in IPython / Jupyter Notebook sessions.
    """
    
# Example usage: read the configuration, resolve the input/output paths and run the detector.
# Every failure is logged once and re-raised as CustomError, chained to the original exception.
if __name__ == "__main__":
    try:
        # Load configuration
        config = load_config(config_file_loc)

        if config:
            # Set desired_directory as working_directory
            desired_directory = config.get('desired_directory')
            working_directory = set_working_directory(desired_directory)

            # Get API key (not used by the local pipeline below; kept for later cells)
            api_key = config.get('api_key')

            if working_directory:
                # Fail with a clear message instead of an opaque TypeError from os.path.join
                required_keys = ('output_dir', 'transform_results', 'detection_results')
                missing_keys = [key for key in required_keys if config.get(key) is None]
                if missing_keys:
                    raise ValueError(f"Missing configuration keys: {', '.join(missing_keys)}")

                # The input directory is currently unused:
                # input_dir = config.get('input_dir')
                # input_data_path = os.path.join(working_directory, input_dir)

                # Get output directory of files
                output_dir = config.get('output_dir')
                output_data_path = os.path.join(working_directory, output_dir)

                # Path of the transformed data, which is the detector's input CSV
                transform_results_file = config.get('transform_results')
                transform_results_file_path = os.path.join(output_data_path, transform_results_file)

                # Path where the detection results CSV is written
                detection_results_file = config.get('detection_results')
                detection_results_file_path = os.path.join(output_data_path, detection_results_file)

                detector = CybersecurityDetector(transform_results_file_path,
                                                 detection_results_file_path)

                try:
                    detector.run()
                except Exception as e:
                    logger.error(f"Error executing run method: {e}")
                    raise CustomError(f"Error executing run method: {e}") from e
    except CustomError:
        # Already logged and wrapped where it was raised; do not log or wrap it a second time
        raise
    except FileNotFoundError as fnf_error:
        logger.error(f"Configuration file not found: {fnf_error}")
        raise CustomError(f"Configuration file not found: {fnf_error}") from fnf_error
    except Exception as e:
        logger.error(f"Unexpected error in main execution: {e}")
        raise CustomError(f"Unexpected error in main execution: {e}") from e


2025-03-18 19:20:47,186 - INFO - Path for configuration file: /Users/henrychang/sys_two_ai/config/config.json
2025-03-18 19:20:47,187 - INFO - Configuration file loaded successfully
2025-03-18 19:20:47,187 - INFO - Current Working Directory: /Users/henrychang/sys_two_ai
2025-03-18 19:20:47,196 - INFO - Using MPS
Device set to use mps:0
2025-03-18 19:20:48,482 - INFO - Threat detection model initialized
2025-03-18 19:20:48,484 - INFO - Data loaded
2025-03-18 19:20:48,485 - INFO - Data preprocessed
2025-03-18 19:20:48,486 - INFO - Simple filtering applied
2025-03-18 19:20:48,487 - INFO - Filtered to likely anomalies only
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
