# In [1]:  (Jupyter notebook cell marker; the code below is the cell's contents)
import pandas as pd
from transformers import pipeline
import logging
import os

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the desired working directory.
# The path is machine specific, so only switch to it when it actually exists;
# an unconditional os.chdir() would raise FileNotFoundError on any other
# machine before the rest of the module could even be imported.
desired_directory = '/Users/henrychang/sys_two_ai'
if os.path.isdir(desired_directory):
    os.chdir(desired_directory)
else:
    logger.warning(
        "Working directory %s not found; staying in %s",
        desired_directory,
        os.getcwd(),
    )
# Verify the change: current_directory is whatever directory is in effect now.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

class CybersecurityDetector:
    """
    CSV-in / CSV-out pipeline that labels log entries as threats.

    The stages, in the order ``run`` calls them: load the CSV, fill missing
    values, drop rows matching known-benign phrases, keep rows below an
    anomaly-score threshold, ask a text-generation model about each remaining
    row, and save the result.

    Each stage logs its own failures instead of raising, so a problem in one
    stage does not crash the caller.
    """

    # Looked up by module name, so this is the same logger object that any
    # module-level ``logging.getLogger(__name__)`` call returns.
    _logger = logging.getLogger(__name__)

    # Regex alternatives (matched case-insensitively) that mark a row as benign.
    FILTER_CONDITIONS = ('login successful', 'system update')

    # Columns interpolated into the model prompt.
    PROMPT_COLUMNS = ('text', 'classification', 'anomaly_label')

    def __init__(self, input_file, output_file, model_name="gpt-4"):
        """
        Initializes the CybersecurityDetector with input and output file paths.

        Parameters:
            input_file (str): Path to the input CSV file.
            output_file (str): Path to save the output CSV file.
            model_name (str): Model identifier handed to ``pipeline``.
                NOTE(review): "gpt-4" does not look like a Hugging Face Hub
                model id - confirm which model is intended.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.df = pd.DataFrame()
        # Always define the attribute so later stages can test for a missing
        # model instead of hitting AttributeError on every row.
        self.threat_detection_model = None
        try:
            # Initialize the threat detection model
            self.threat_detection_model = pipeline("text-generation", model=model_name)
            self._logger.info("Threat detection model initialized")
        except Exception as e:
            self._logger.error("Error initializing threat detection model: %s", e)

    def load_data(self):
        """
        Loads data from the specified CSV file into a DataFrame (self.df).

        Returns:
            bool: True when the file was read, False when reading failed
            (the failure is logged and self.df is left unchanged).
        """
        try:
            self.df = pd.read_csv(self.input_file)
        except FileNotFoundError:
            self._logger.error("Error: File %s not found.", self.input_file)
        except pd.errors.EmptyDataError:
            self._logger.error("Error: No data in the file.")
        except pd.errors.ParserError:
            self._logger.error("Error: Parsing error.")
        except Exception as e:
            self._logger.error("Error during data loading: %s", e)
        else:
            self._logger.info("Data loaded")
            return True
        return False

    @staticmethod
    def _fill_value_for(series):
        """
        Picks the replacement for missing values in one column.

        Returns None when the column type is not handled or when no usable
        replacement exists (for example, every value is missing).
        """
        types = pd.api.types
        if series.dtype in ('float64', 'int64'):  # Numeric columns -> mean
            value = series.mean()
        elif types.is_object_dtype(series) or types.is_string_dtype(series):
            # Categorical/text columns -> most frequent value. mode() is empty
            # when the whole column is missing, so guard the lookup.
            modes = series.mode()
            value = None if modes.empty else modes.iloc[0]
        elif types.is_datetime64_any_dtype(series):  # Datetime -> earliest date
            value = series.min()
        elif types.is_bool_dtype(series):  # Boolean columns -> False
            value = False
        else:
            return None
        return None if pd.isna(value) else value

    def preprocess_data(self):
        """
        Fills missing values in numeric, categorical, datetime, and boolean columns with appropriate replacements.

        Columns without missing values, and columns for which no replacement
        can be derived, are left untouched.
        """
        try:
            for col in self.df.columns:
                series = self.df[col]
                if not series.isna().any():
                    continue
                fill_value = self._fill_value_for(series)
                if fill_value is None:
                    continue
                # Assign the filled column back: a chained
                # df[col].fillna(..., inplace=True) acts on a temporary and
                # does not update the frame under pandas Copy-on-Write.
                self.df[col] = series.fillna(fill_value)
            self._logger.info("Data preprocessed")
        except Exception as e:
            self._logger.error("Error during preprocessing: %s", e)

    def simple_filter(self):
        """
        Applies a simple rule-based filter to remove non-threat rows based on certain conditions.

        Rows whose 'text' is missing are kept: they cannot be shown to be benign.
        """
        try:
            pattern = '|'.join(self.FILTER_CONDITIONS)
            # na=False keeps the mask strictly boolean; without it a missing
            # 'text' value makes the negation below raise.
            is_benign = self.df['text'].str.contains(pattern, case=False, na=False)
            # .copy() detaches the result so later column assignments do not
            # act on a slice of the original frame.
            self.df = self.df[~is_benign].copy()
            self._logger.info("Simple filtering applied")
        except Exception as e:
            self._logger.error("Error during simple filtering: %s", e)

    def filter_anomalies(self, threshold=0.5):
        """
        Filters data to include only anomalies based on the anomaly_score column.

        Parameters:
            threshold (float): Rows with anomaly_score strictly below this
                value are kept.
                NOTE(review): this assumes lower scores mean "more anomalous"
                - confirm against whatever produces anomaly_score.
        """
        try:
            self.df = self.df[self.df['anomaly_score'] < threshold].copy()
            self._logger.info("Filtered to anomalies only")
        except Exception as e:
            self._logger.error("Error during anomaly filtering: %s", e)

    def _classify_row(self, row):
        """
        Asks the model about one row; returns 'Threat', 'Normal' or 'Error'.
        """
        try:
            # Create a prompt for threat detection
            prompt = f"Log entry: {row['text']}\nClassification: {row['classification']}\nPredictive Anomaly Label: {row['anomaly_label']}\nIs this a threat?"
            outputs = self.threat_detection_model(
                prompt,
                max_new_tokens=50,  # budget for the answer only, not prompt + answer
                num_return_sequences=1,
                return_full_text=False,
            )
            answer = outputs[0]['generated_text']
            # The prompt itself ends with "Is this a threat?". If the model
            # echoes it back, searching the full text would label every row
            # 'Threat', so only the continuation is inspected.
            if answer.startswith(prompt):
                answer = answer[len(prompt):]
            # NOTE: plain substring match - an answer such as "not a threat"
            # is still labelled 'Threat'.
            return 'Threat' if 'threat' in answer.lower() else 'Normal'
        except Exception as e:
            self._logger.error("Error during threat detection for row: %s - %s", row.name, e)
            return 'Error'

    def detect_threats(self):
        """
        Detects threats using a generative AI model by analyzing log entries and their classifications.

        Adds a 'threat_detection' column holding 'Threat', 'Normal' or
        'Error' for every row.
        """
        try:
            if self.df.empty:
                # apply(axis=1) on an empty frame returns a DataFrame, which
                # cannot be assigned to a single column; add it directly.
                self.df['threat_detection'] = pd.Series(dtype=object, index=self.df.index)
                self._logger.info("No rows to analyze for threats")
                return

            missing = [c for c in self.PROMPT_COLUMNS if c not in self.df.columns]
            if self.threat_detection_model is None or missing:
                # Same per-row outcome as failing on each row, reported once.
                reason = (
                    "model is not initialized"
                    if self.threat_detection_model is None
                    else f"missing columns {missing}"
                )
                self._logger.error("Error applying threat detection: %s", reason)
                self.df['threat_detection'] = 'Error'
                return

            self.df['threat_detection'] = self.df.apply(self._classify_row, axis=1)
            self._logger.info("Threats detected using generative AI")
        except Exception as e:
            self._logger.error("Error applying threat detection: %s", e)

    def save_data(self):
        """
        Saves the DataFrame with threat detection results to the specified output CSV file.

        The output file's parent directory is created when it does not exist.
        """
        try:
            out_dir = os.path.dirname(self.output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            self.df.to_csv(self.output_file, index=False)
            self._logger.info("Results saved to %s", self.output_file)
        except Exception as e:
            self._logger.error("Error saving data: %s", e)

    def run(self):
        """
        Orchestrates the entire workflow by calling the methods in sequence.

        Stops right after loading when the input could not be read, so no
        output file is written for data that was never loaded.
        """
        try:
            if not self.load_data():
                self._logger.error("Aborting run: input data could not be loaded")
                return
            self.preprocess_data()
            self.simple_filter()
            self.filter_anomalies()
            self.detect_threats()
            self.save_data()
        except Exception as e:
            self._logger.error("Error in run process: %s", e)

# Example usage
if __name__ == "__main__":
    # Build paths with os.path.join so each component is separated by exactly
    # one separator (plain string concatenation produced ".../input//file").
    input_data_dir = os.path.join(current_directory, 'input')
    output_data_dir = os.path.join(current_directory, 'output')

    detector = CybersecurityDetector(
        os.path.join(input_data_dir, 'enriched_data.csv'),
        os.path.join(output_data_dir, 'detection_results.csv'),
    )
    detector.run()


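# Captured cell output: ModuleNotFoundError: No module named 'transformers'
# (the `transformers` package must be installed before this cell can run, e.g. `pip install transformers`)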