In [5]:
'''
AI Agent for Data Transformation:
Combining traditional machine learning (ML) techniques with modern Large Language Models (LLMs) 
enhances the detection and analysis of security threats in log data.

Hybrid Approach:
    Traditional ML Techniques: 
        Methods like Isolation Forests, clustering algorithms, and statistical analysis identify 
    patterns and outliers in structured data.
    Large Language Models: 
        Advanced LLMs (e.g., GPT-3.5 Turbo, GPT-4, or their fin-tuned versions) excel at parsing 
        and interpreting complex log entries, generating contextual insights and responses.
        
Data Processing:
    Stream-based Approach:
        Data are read in and processed sequentially. For example, by our proprietary algorithm, 
        the aggregation features can be prepared efficiently in real-time.
    Filtering Non-Threat Data: 
        Non-relevant data is filtered out initially to reduce costs.
    Data Augmentation: 
        Augmentation is provided as needed for better predictive performance.
        
By integrating these approaches, the AI agent enhances threat detection accuracy through 
a combination of numerical and contextual analysis.
'''
import sys
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import openai
import joblib
import requests
import logging
import os

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the desired working directory 
desired_directory = '/Users/henrychang/sys_two_ai' 
os.chdir(desired_directory) 
# Verify the change 
current_directory = os.getcwd() 
print("Current Working Directory:", current_directory)

class CybersecurityDataTransformation:
    """
    A class to detect cybersecurity threats using various machine learning models and AI data processing techniques.
 
    Attributes:
        input_file (str): Path to the input file.
        output_file (str): Path to the output file.
        vectorizer_file (str): Path to the vectorizer file.
        model_file (str): Path to the model file.
        scaler_file (str): Path to the scaler file.
        label_encoders_file (str): Path to the label encoders file.
        aggregated_features_file (str): Path to the aggregated features file.
        fine_tune_id_file (str): Path to the fine-tuned model ID file.
        chunk_size (int): Size of data chunks to process. Default is 1000.

    Ensure these parameters are correctly included in the init method
    """

    def __init__(self, input_file, output_file, missing_dict_file, vectorizer_file, model_file, scaler_file, label_encoders_file, aggregated_features_file, fine_tune_id_file, chunk_size=1000):
        self.input_file = input_file
        self.output_file = output_file
        self.missing_dict_file = missing_dict_file
        self.vectorizer_file = vectorizer_file
        self.model_file = model_file
        self.scaler_file = scaler_file
        self.label_encoders_file = label_encoders_file
        self.aggregated_features_file = aggregated_features_file
        self.fine_tune_id_file = fine_tune_id_file
        self.chunk_size = chunk_size
        self.df = pd.DataFrame()
        self.missing_dict = {}
        self.vectorizer = CountVectorizer()
        self.isolation_forest = IsolationForest()
        self.scaler = MinMaxScaler()
        self.label_encoders = {}
        self.aggregated_features = {}
        openai.api_key = 'MY_OPENAI_API_KEY' # Set the OpenAI API key
        # If using the Hugging Face transformers library and the models are available locally or 
        # downloaded from the Hugging Face model hub, we do not need an API key.
        # Initialize the generative model. If resources are limited, use DistilGPT-2; 
        # otherwise, use GPT-4. GPT-3.5 Turbo is a balanced option.
        self.text_generator = pipeline('text-generation', model='gpt-3.5-turbo') 


    def load_fine_tuned_model(self):
        """
        Loads the fine-tuned model using the saved fine_tune_id.

        Returns:
            str: The fine-tuned model identifier.

        Raises:
            SystemExit: If an error occurs while loading the fine-tuned model.
        """
        try:
            # Read the fine-tune ID from the file
            with open(self.fine_tune_id_file, 'r') as f:
                fine_tune_id = f.read().strip()
            logger.info(f"Fine-tune ID read from file: {fine_tune_id}")

            # Retrieve the fine-tune response using the fine-tune ID
            fine_tune_response = openai.FineTune.retrieve(id=fine_tune_id)
            fine_tuned_model = fine_tune_response['fine_tuned_model']
            logger.info(f"Fine-tuned model loaded: {fine_tuned_model}")

            return fine_tuned_model
        except Exception as e:
            logger.error(f"Error loading fine-tuned model: {e}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error


    def load_preprocessors(self):
        """
        Loads the vectorizer, scaler, and label encoders from their respective files.

        Raises:
            SystemExit: If an error occurs while loading any of the preprocessors.
        """
        try:
            self.vectorizer = joblib.load(self.vectorizer_file)
            self.scaler = joblib.load(self.scaler_file)
            self.label_encoders = joblib.load(self.label_encoders_file)
            logger.info("Vectorizer, scaler, and label encoders loaded.")
        except Exception as e:
            logger.error(f"Error loading preprocessors: {e}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error

    def load_isolation_forest(self):
        """
        Loads the trained Isolation Forest model from the model file.

        Raises:
            SystemExit: If an error occurs while loading the Isolation Forest model.
        """
        try:
            self.isolation_forest = joblib.load(self.model_file)
            logger.info("Trained Isolation Forest model loaded.")
        except Exception as e:
            logger.error(f"Error loading trained Isolation Forest model: {e}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error

    def load_aggregated_features(self):
        """
        Loads the aggregated features dictionary from the aggregated features file.

        Raises:
            SystemExit: If an error occurs while loading the aggregated features dictionary.
        """
        try:
            self.aggregated_features = joblib.load(self.aggregated_features_file)
            logger.info("Aggregated features dictionary loaded.")
        except Exception as e:
            logger.error(f"Error loading aggregated features dictionary: {e}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error


    def load_missing_data_replacement(self):
        """
        Loads the dictionary containing precomputed values for missing data.
        Raises:
            SystemExit: If an error occurs while loading the missing values dictionary.
        """
        try:
            # Load the dictionary containing precomputed values for missing data using joblib 
            self.missing_dict = joblib.load(self.missing_dict_file) 
            logger.info("Missing data replacement dictionary loaded successfully")
        except FileNotFoundError:
            logger.error(f"File not found: {self.missing_dict_file}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error
        except Exception as e:
            logger.error(f"Error loading file: {e}")
            sys.exit(1)  # Exit the program with status code 1 indicating an error

    '''
    Key Points to Consider
    Data Distribution: Choose the imputation method based on the distribution of the data.
    Impact on Analysis: Consider how the imputed values might affect subsequent analyses and model performance.
    Consistency: Ensure that the imputation method is applied consistently across similar datasets.
    '''
    def preprocess_data(self, chunk):
        """
        Preprocesses the data chunk by filling missing values using precomputed values from a dictionary.
        Parameters:
            chunk (DataFrame): The data chunk to preprocess.
        Returns:
            DataFrame: The preprocessed data chunk.
        Raises:
            SystemExit: If an error occurs during preprocessing.
        """
        try:
            # Iterate through each column in the chunk to fill missing values
            for col in chunk.columns:
                # Numeric columns, Categorical columns, Boolean columns
                if chunk[col].dtype in ['float64', 'int64', 'object', 'bool']:
                    chunk[col].fillna(self.missing_dict[col], inplace=True)  # Use precomputed value
                elif pd.api.types.is_datetime64_any_dtype(chunk[col]):  # Datetime columns
                    chunk[col].fillna(self.missing_dict[col], inplace=True)  # Use precomputed value
            logger.info("Data preprocessed")
            return chunk
        except Exception as e:
            logger.error(f"Error during preprocessing: {e}")
            sys.exit(1)  # Exit the program if preprocessing fails


    '''
    Remove rows that are obviously not threats.
    '''
    def simple_filter(self, chunk):
        """
        Applies a simple rule-based filter to remove non-threat rows based on certain conditions.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The filtered data chunk.
        Raises:
            None: Logs any exceptions encountered and returns the original chunk.
        """
        try:
            # Define conditions to filter out non-threat rows 
            filter_conditions = [ 'login successful', 
                                 'system update', 
                                 'user logged in', 
                                 'user logged out', 
                                 'system rebooted', 
                                 'file saved', 
                                 'file opened', 
                                 'file closed', 
                                 'session ended', 
                                 'session started', 
                                 'heartbeat message', 
                                 'backup completed', 
                                 'scheduled task completed', 
                                 'configuration updated' ]

            # Apply the filter conditions to the chunk
            chunk = chunk[~chunk['log_text'].str.contains('|'.join(filter_conditions), case=False)]
            logger.info("Simple filtering applied")
            return chunk
        except Exception as e:
            logger.error(f"Error during simple filtering: {e}")
            return chunk

    def vectorize_data(self, chunk):
        """
        Vectorizes the text data in the given data chunk using the pre-trained vectorizer.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            sparse matrix: The vectorized text data.
        Raises:
            None: Logs any exceptions encountered and returns None.
        """
        try:
            # Extract text data from the chunk
            text_data = chunk['log_text']

            # Transform the text data using the pre-trained vectorizer
            vectorized_data = self.vectorizer.transform(text_data)
            return vectorized_data
        except Exception as e:
            logger.error(f"Error during vectorization: {e}")
            return None

    
    '''
    The exact range of these scores depends on the data and the
    specifics of the Isolation Forest model, but they often fall
    within a range approximately between -1 and 1, where:

    Scores closer to 0: Indicate points that are somewhat
    isolated but not extreme.
    Strong negative scores (e.g., below -0.5): Indicate points
    that are strongly considered as anomalies.
    Scores approaching 1: Indicate points that are considered
    normal.
    '''
    def predict_anomalies(self, chunk, vectorized_data):
        """
        Detects anomalies in the given data chunk using the trained Isolation Forest model.
        Parameters:
            chunk (DataFrame): The data chunk to process.
            vectorized_data (sparse matrix): The vectorized data corresponding to the chunk.
        Returns:
            DataFrame: The data chunk with anomaly predictions and scores.
        Raises:
            None: Logs any exceptions encountered and returns None if an error occurs.
        """
        try:
            # Predict anomalies using the Isolation Forest model
            predictions = self.isolation_forest.predict(vectorized_data)
            chunk['anomaly'] = predictions

            # Calculate anomaly scores (derived from the number of splits required to isolate a data point)
            anomaly_scores = self.isolation_forest.decision_function(vectorized_data)
            chunk['anomaly_score'] = anomaly_scores

            logger.info("Anomalies detected and scores assigned")
            return chunk
        except Exception as e:
            logger.error(f"Error during anomaly detection: {e}")
            return None

            
    def postprocess_anomalies(self, chunk):
        """
        Post-processes the anomaly predictions by assigning human-readable labels.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The data chunk with anomaly labels.
        Raises:
            None: Logs any exceptions encountered and returns None if an error occurs.
        """
        try:
            # Assign human-readable labels based on anomaly predictions
            chunk['anomaly_label'] = chunk['anomaly'].apply(lambda x: 'Anomaly' if x == -1 else 'Normal')
            logger.info("Anomaly data post-processed")
            return chunk
        except Exception as e:
            logger.error(f"Error during anomaly post-processing: {e}")
            return None

    '''
    Remove rows that are obviously not threats.

    In practice, the threshold (x) is usually set to 0. So:
    Anomaly Score < 0: Likely an anomaly.
    Anomaly Score ≥ 0: Likely not an anomaly (normal).

    Using a more stringent threshold like 0.5 instead of 0.0 can help in
           identifying anomalies with greater caution.
    Anomaly Score < 0.5: Likely an anomaly.
    Anomaly Score ≥ 0.5: Likely not an anomaly (normal).
    '''
    def filter_anomalies(self, chunk):
        """
        Filters the data chunk to include only likely anomalies based on the anomaly score.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The filtered data chunk containing only likely anomalies.
        Raises:
            None: Logs any exceptions encountered and returns the original chunk if an error occurs.
        """
        try:
            # Filter the chunk to include rows with anomaly scores less than 0.5
            chunk = chunk[chunk['anomaly_score'] < 0.5]
            logger.info("Filtered to likely anomalies only")
            return chunk
        except Exception as e:
            logger.error(f"Error during anomaly filtering: {e}")
            return None

    def categorize_features(self, chunk):
        """
        Encodes categorical features in the data chunk using pre-trained label encoders.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The data chunk with encoded categorical features.
        Raises:
            None: Logs any exceptions encountered and returns None if an error occurs.
        """
        try:
            # Select categorical columns for encoding
            categorical_cols = chunk.select_dtypes(include=['object']).columns

            # Apply label encoders to transform categorical features
            for col in categorical_cols:
                chunk[col] = self.label_encoders[col].transform(chunk[col])
            logger.info("Features categorized")
            return chunk
        except Exception as e:
            logger.error(f"Error during categorization: {e}")
            return None
            
    def normalize_features(self, chunk):
        """
        Normalizes numerical features in the data chunk using the pre-trained scaler.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The data chunk with normalized numerical features.
        Raises:
            None: Logs any exceptions encountered and returns None if an error occurs.
        """
        try:
            # Select numerical columns for normalization
            numerical_cols = chunk.select_dtypes(include=['float64', 'int64']).columns

            # Apply the scaler to normalize numerical features
            chunk[numerical_cols] = self.scaler.transform(chunk[numerical_cols])
            logger.info("Features normalized")
            return chunk
        except Exception as e:
            logger.error(f"Error during normalization: {e}")
            return None
    
    '''
    model='distilbert-base-uncased-finetuned-sst-2-english': 
    Specifies the pre-trained model to use. In this case, it's a DistilBERT model fine-tuned 
    on the SST-2 dataset, which is commonly used for sentiment analysis 
    (e.g., classifying text as positive or negative).
    '''
    def classify_logs_with_llm(self, chunk):
        """
        Classifies log entries in the given data chunk using a pre-trained language model (LLM).
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The data chunk with classified log entries.
        Raises:
            None: Logs any exceptions encountered and returns None.
        """
        try:
            # Initialize the text classification pipeline with a pre-trained model
            classifier = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')

            # Apply the classifier to each log entry and store the classification result
            # 'POSITIVE': Indicates that the text has a positive sentiment.
            # 'NEGATIVE': Indicates that the text has a negative sentiment.
            chunk['classification'] = chunk['text'].apply(lambda x: classifier(x)[0]['label'])

            logger.info("Logs classified with LLM")
            return chunk
        except Exception as e:
            logger.error(f"Error during classification: {e}")
            return None


    '''
    Example of the dictionary:
    {
        'user_1': {
            'count': 7,
            'login_attempts': 3.6666666666666665, 
            'failed_login_attempts': 1.0,
            'session_duration': 50.0,
            'data_transferred': 110.0,
            'access_sensitive_files': 0.3333333333333333, 'count': 3 
        },
        'user_2': {
        ...
        },
      ...
    }
    
    columns of chunk after aggregate_features(chunk):
    user_id  login_attempts  failed_login_attempts  session_duration  data_transferred  access_sensitive_files  count  login_attempts_agg  failed_login_attempts_agg  session_duration_agg  data_transferred_agg  access_sensitive_files_agg
    '''
    def aggregate_features(self, chunk):
        """ 
        Aggregates features from the given data chunk and updates the aggregated features dictionary. 
        Parameters: 
            chunk (DataFrame): The data chunk to process. 
        Returns: 
            DataFrame: The data chunk merged with the aggregated features. 
        """
        try:
            # Iterate through each row in the chunk
            for index, row in chunk.iterrows():
                user_id = row['user_id']
                if user_id not in self.aggregated_features:
                    # Initialize the aggregated features for a new user_id
                    self.aggregated_features[user_id] = {
                        'count': 1,
                        'login_attempts': row['login_attempts'],
                        'failed_login_attempts': row['failed_login_attempts'],
                        'session_duration': row['session_duration'],
                        'data_transferred': row['data_transferred'],
                        'access_sensitive_files': row['access_sensitive_files']
                    }
                else:
                    # Update the aggregated features for an existing user_id
                    self.aggregated_features[user_id]['count'] += 1
                    for feature in ['login_attempts', 'failed_login_attempts', 'session_duration', 'data_transferred', 'access_sensitive_files']:
                        self.aggregated_features[user_id][feature] += (1.0 / self.aggregated_features[user_id]['count']) * (row[feature] - self.aggregated_features[user_id][feature])
           
            # Convert dictionary to DataFrame
            aggregated_df = pd.DataFrame.from_dict(self.aggregated_features, orient='index').reset_index().rename(columns={'index': 'user_id'})
           
            # Merge aggregated data back with the original chunk
            # Perform a left join to ensure the row count of `chunk` remains the same
            chunk = pd.merge(chunk, aggregated_df, on='user_id', how='left', suffixes=('', '_agg'))
           
            logger.info("Aggregated features merged with chunk")
            return chunk
        except Exception as e:
            logger.error(f"Error during aggregation: {e}")
            return None
    
       
    def augment_data(self, chunk):
        """ 
        Augments the data in the given chunk by paraphrasing text and modifying numerical values. 
        Parameters: 
            chunk (DataFrame): The data chunk to process. 
        Returns: 
            DataFrame: The augmented data chunk combined with the original chunk. 
        """
        try:
            def paraphrase_text(row, num_return_sequences=1):
                """ 
                Paraphrases the text in the log entry using a text generation model. 
                Parameters: 
                    row (Series): A row of data containing the log text. 
                    num_return_sequences (int): The number of paraphrased sequences to generate. 
                Returns: 
                    list: A list of paraphrased text sequences. 
                """
                try:
                    paraphrases = self.text_generator(row['log_text'], max_length=50, num_return_sequences=num_return_sequences)
                    return [p['generated_text'] for p in paraphrases]
                except Exception as e:
                    logger.error(f"Error during paraphrasing: {e}")
                    return [row['log_text']]  # Return original text if paraphrasing fails
    
            def augment_numerical(value):
                """ 
                Augments a numerical value by multiplying it with a random factor. 
                Parameters: 
                    value (float or int): The numerical value to augment. 
                Returns: 
                    float or int: The augmented numerical value. 
                """
                try:
                    augmented_value = value * np.random.uniform(0.9, 1.1)
                    if isinstance(value, np.int64):
                        return int(round(augmented_value))
                    else:
                        return augmented_value
                except Exception as e:
                    logger.error(f"Error during numerical augmentation: {e}")
                    return value  # Return original value if augmentation fails
    
            augmented_rows = []
            for index, row in chunk.iterrows():
                try:
                    # Paraphrase the log text
                    paraphrased_logs = paraphrase_text(row, num_return_sequences=2)
    
                    for paraphrased_log in paraphrased_logs:
                        new_row = row.copy()
    
                        # Set new paraphrased log text
                        new_row['log_text'] = paraphrased_log
    
                        # Augment numerical columns
                        for col in chunk.select_dtypes(include=['float64', 'int64']).columns:
                            new_row[col] = augment_numerical(new_row[col])
    
                        augmented_rows.append(new_row)
                except Exception as e:
                    logger.error(f"Error processing row {index}: {e}")
                    continue
            augmented_chunk = pd.DataFrame(augmented_rows)
            return pd.concat([chunk, augmented_chunk], ignore_index=True)
        except Exception as e:
            logger.error(f"Error augmenting data: {e}")
            return chunk  # Return original chunk if augmentation fails
    
    
    '''
    Using row['text'] as Query:
    
    The detect_threat function uses row['text'] as the query to retrieve relevant documents.
    
    Document Retrieval:
    
    The retrieve_documents method uses the query to search for related documents via the Bing Search API.
    
    Augmented Prompt:
    
    The retrieved documents are included as additional context in the prompt.
    '''
    def retrieve_documents(self, query):
        """ 
        Retrieves relevant documents for the given query using the Bing Search API. 
        Parameters: 
            query (str): The search query text. 
        Returns: 
            list: A list of retrieved document snippets relevant to the query. 
        Raises: 
            None: Logs any exceptions encountered and returns None. 
        """
        try:
            # Formulate the search query dynamically based on the log entry text
            search_query = f"System Two Security: {query}"
           
            # Make a request to the Bing Search API
            response = requests.get(
                'https://api.bing.microsoft.com/v7.0/search',
                params={'q': search_query, 'count': 5},
                headers={'Ocp-Apim-Subscription-Key': 'YOUR_BING_SEARCH_API_KEY'}
            )
           
            # Parse the JSON response
            search_results = response.json()
           
            # Extract relevant documents (simplified for illustration)
            retrieved_docs = [result['snippet'] for result in search_results.get('webPages', {}).get('value', [])]
           
            return retrieved_docs
        except Exception as e:
            logger.error(f"Error retrieving documents for query '{query}': {e}")
            return None
    
    
    '''
    Primary Focus on Log Entry: The prompt focuses on the original log entry and its attributes.
    
    Additional Context: The retrieved documents are added as supplementary context, not the main focus.
    
    Balanced Prompt: This ensures that the decision-making process revolves around the log entry while benefiting from additional context.
    '''
    def detect_threats(self, chunk):
        """
        Detects threats in the given data chunk using a generative AI model.
        Parameters:
            chunk (DataFrame): The data chunk to process.
        Returns:
            DataFrame: The processed data chunk with threat detection results.
        """
        def detect_threat(row):
            """
            Detects threats for a single row of data.
            Parameters:
                row (Series): A row of data from the chunk.
            Returns:
                str: 'Threat' if the generated text indicates a threat, 'Normal' otherwise.
            """
            try:
                # Use the log entry text as the query
                query = row['log_text']
                
                # Retrieve relevant documents for the log entry
                retrieved_docs = self.retrieve_documents(query)
                docs_text = "\n".join(retrieved_docs)
                
                # Create the augmented prompt
                prompt = (
                    f"Log entry: {row['text']}\n"
                    f"Classification: {row['classification']}\n"
                    f"Predictive Anomaly Label: {row['anomaly_label']}\n"
                    "Is this a threat?\n"
                    f"Additional context:\n{docs_text}"
                )
                
                # Generate text using text_generator as the threat detection model
                generated_text = self.text_generator(prompt, 
                                                             max_length=50, 
                                                             num_return_sequences=1)[0]['generated_text']
                return 'Threat' if 'threat' in generated_text.lower() else 'Normal'
            except Exception as e:
                logger.error(f"Error during threat detection for row: {row.name} - {e}")
                return 'Error'
    
        try:
            # Apply the detect_threat function to each row in the chunk
            chunk['threat_detection'] = chunk.apply(detect_threat, axis=1)
            logger.info("Threats detected using generative AI")
            return chunk
        except Exception as e:
            logger.error(f"Error applying threat detection: {e}")
            return None
    
    def save_chunk(self, chunk):
        """
        Saves the processed data chunk to the output file.
        Parameters:
            chunk (DataFrame): The data chunk to save.
        """
        try:
            chunk.to_csv(self.output_file, 
                         mode='a', 
                         index=False, 
                         header=not pd.io.common.file_exists(self.output_file))
            logger.info(f"Chunk saved to {self.output_file}")
        except Exception as e:
            logger.error(f"Error saving chunk: {e}")
    
    def save_aggregated_features(self):
        """
        Saves the aggregated features to a file.
        """
        try:
            joblib.dump(self.aggregated_features, self.aggregated_features_file)
            logger.info("Aggregated features saved.")
        except Exception as e:
            logger.error(f"Error saving aggregated features: {e}")
    
    def run(self):
        """ 
        Runs the data processing pipeline in chunks. This method loads the preprocessors, aggregated features, etc. It then processes the input data in chunks, applying various 
        data transformations and saving results. 
        """
        self.load_missing_data_replacement()
        self.load_isolation_forest()
        self.load_preprocessors()
        self.load_aggregated_features()
        self.load_fine_tuned_model()
        # If we need to fine-tune GPT-3.5-Turbo on the specific dataset using OpenAI’s fine-tuning API. 
        # Once fine-tuned, save fine_tune_id locally and upload it to get fine_tuned_model.
        # Additionally, we may implement rag_text_generation(query) to use RAG for better predictive performance.
        fine_tuned_model = self.load_fine_tuned_model()  
        self.text_generator = pipeline('text-generation', model=fine_tuned_model)
       
        try:
            chunks = pd.read_csv(self.input_file, chunksize=self.chunk_size)
            for chunk in chunks:
                try:
                    if (chunk := self.simple_filter(chunk)) is None:
                        continue
                    if (chunk := self.preprocess_data(chunk)) is None:
                        continue
                    if (vectorized_data := self.vectorize_data(chunk)) is None:
                        continue
                    if (chunk := self.predict_anomalies(chunk, vectorized_data)) is None:
                        continue
                    if (chunk := self.postprocess_anomalies(chunk)) is None:
                        continue
                    if (chunk := self.filter_anomalies(chunk)) is None:
                        continue
                    if (chunk := self.categorize_features(chunk)) is None:
                        continue
                    if (chunk := self.normalize_features(chunk)) is None:
                        continue
                    if (chunk := self.classify_logs_with_llm(chunk)) is None:
                        continue
                    if (chunk := self.aggregate_features(chunk)) is None:
                        continue
                    if (chunk := self.augment_data(chunk)) is None:
                        continue
                    if (chunk := self.detect_threats(chunk)) is None:
                        continue
                    self.save_chunk(chunk)
                except Exception as e:
                    logger.error(f"Error processing chunk: {e}")
            self.save_aggregated_features()  # Ensure to call this method outside the inner try-except block
        except FileNotFoundError:
            logger.error(f"Error: File not found.")
        except pd.errors.EmptyDataError:
            logger.error("Error: No data in the file.")
        except pd.errors.ParserError:
            logger.error("Error: Parsing error.")
        except Exception as e:
            logger.error(f"Error loading data in chunks: {e}")


# Example usage
if __name__ == "__main__":
    input_dir = current_directory + '/input/'
    output_dir = current_directory + '/output/'
    
    detector = CybersecurityDataTransformation(input_dir + '/input_data.csv', 
                                               output_dir + '/detection_results.csv', 
                                               output_dir + '/missing_dict.pkl', 
                                               output_dir + '/vectorizer.pkl', 
                                               output_dir + '/isolation_forest_model.pkl', 
                                               output_dir + '/scaler.pkl', 
                                               output_dir + '/label_encoders.pkl', 
                                               output_dir + '/aggregated_features.pkl', 
                                               output_dir + 'fine_tune_id.txt',
                                               chunk_size=1000)
    try:
        detector.run()
    except Exception as e:
        logger.error(f"Error executing run method: {e}")
        sys.exit(1) # Exit the program with a status code 1 indicating an error

Current Working Directory: /Users/henrychang/sys_two_ai


OSError: gpt-3.5-turbo is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`