In [27]:
'''
The following code is an ML-based classifier. It generates anomaly-related features that can 
filter out non-threat-related events. Additionally, it can be used for data transformation 
during inference, whether in stream-based mode or batch processing mode.
'''

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import joblib
import logging
import os # Get the current working directory 

# Set the desired working directory 
desired_directory = '/Users/henrychang/sys_two_ai' 
os.chdir(desired_directory) 
# Verify the change 
current_directory = os.getcwd() 
print("Current Working Directory:", current_directory)

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class AnomalyClassifier:
    def __init__(self, input_file, vectorizer_file, trained_model):
        """
        Initializes the AnomalyClassifier with the input file, vectorizer file, and trained model.

        Parameters:
            input_file (str): Path to the input CSV file.
            vectorizer_file (str): Path to save the trained CountVectorizer.
            trained_model (str): Path to save the trained Isolation Forest model.
        """
        self.input_file = input_file
        self.vectorizer_file = vectorizer_file
        self.trained_model = trained_model
        self.df = pd.DataFrame()
        self.vectorizer = CountVectorizer()
        self.isolation_forest = IsolationForest(n_estimators=100, contamination=0.1)

    def load_data(self):
        """
        Loads data from the specified CSV file into a DataFrame (self.df).
        """
        try:
            self.df = pd.read_csv(self.input_file)
            logger.info("Data loaded")
        except FileNotFoundError:
            logger.error(f"Error: File {self.input_file} not found.")
        except pd.errors.EmptyDataError:
            logger.error("Error: No data in the file.")
        except pd.errors.ParserError:
            logger.error("Error: Parsing error.")
        except Exception as e:
            logger.error(f"Error during data loading: {e}")

    def preprocess_data(self):
        """
        Fills missing values in numeric, categorical, datetime, and boolean columns with appropriate replacements.
        """
        try:
            for col in self.df.columns:
                if self.df[col].dtype in ['float64', 'int64']:  # Numeric columns
                    self.df[col].fillna(self.df[col].mean(), inplace=True)  # Replace with mean (or use median, 0, etc.)
                elif self.df[col].dtype == 'object':  # Categorical columns
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)  # Replace with mode (most frequent value)
                elif pd.api.types.is_datetime64_any_dtype(self.df[col]):  # Datetime columns
                    self.df[col].fillna(self.df[col].min(), inplace=True)  # Replace with earliest date (or a default date)
                elif self.df[col].dtype == 'bool':  # Boolean columns
                    self.df[col].fillna(False, inplace=True)  # Replace with False (or True, or majority value)
            logger.info("Data preprocessed")
        except Exception as e:
            logger.error(f"Error during preprocessing: {e}")

    def vectorize_data(self):
        """
        Vectorizes text data for anomaly detection and splits it into training and testing sets.

        Returns:
            tuple: Vectorized training data, vectorized testing data, training index, testing index.
        """
        try:
            # Vectorize text data for anomaly detection
            text_data = self.df['log_text']

            # Split data into training and testing sets
            X_train, X_test = train_test_split(text_data, test_size=0.2, random_state=42)

            # Fit and transform the vectorizer on training data, transform testing data
            vectorized_train = self.vectorizer.fit_transform(X_train)
            vectorized_test = self.vectorizer.transform(X_test)

            return vectorized_train, vectorized_test, X_train.index, X_test.index
        except Exception as e:
            logger.error(f"Error during vectorization: {e}")

    def train_isolation_forest(self, vectorized_train, vectorized_test, train_index, test_index):
        """
        Trains the Isolation Forest model on the vectorized training data.
        Predicts anomalies on the testing data and adds predictions to the DataFrame.
        Evaluates model accuracy on test data.

        Parameters:
            vectorized_train (sparse matrix): Vectorized training data.
            vectorized_test (sparse matrix): Vectorized testing data.
            train_index (Index): Index of training data.
            test_index (Index): Index of testing data.
        """
        try:
            # Train isolation forest on the vectorized training data
            self.isolation_forest.fit(vectorized_train)
            logger.info("Isolation Forest model trained.")

            # Predict anomalies on the testing set
            # 1: Indicates that the data point is considered "normal" or "inlier" by the model.
            # -1: Indicates that the data point is considered "anomalous" or "outlier" by the model.
            test_predictions = self.isolation_forest.predict(vectorized_test)
            # print(test_predictions)

            # Add predictions to the DataFrame
            self.df.loc[test_index, 'anomaly'] = test_predictions
            # Evaluate the model
            true_labels = self.df.loc[test_index, 'anomaly_act'].tolist()

            accuracy = accuracy_score(test_predictions, true_labels)  # Assuming anomalies are -1
            logger.info(f"Model accuracy on test data: {accuracy}")
        except Exception as e:
            logger.error(f"Error during isolation forest training: {e}")

    def evaluate_performance(self, test_index):
        """
        Evaluates the performance of the Isolation Forest model on the test data.

        Parameters:
            test_index (Index): Index of the test data.
        """
        try:
            # Compute the confusion matrix
            cm = confusion_matrix(self.df.loc[test_index, 'anomaly_act'], self.df.loc[test_index, 'anomaly'])
            logger.info("Confusion Matrix:")
            logger.info(cm)

            # Print a detailed classification report
            report = classification_report(self.df.loc[test_index, 'anomaly_act'], self.df.loc[test_index, 'anomaly'])
            logger.info("Classification Report:")
            logger.info(report)
        except Exception as e:
            logger.error(f"Error evaluating performance: {e}")

    def save_isolation_forest_model(self):
        """
        Saves the trained Isolation Forest model and vectorizer to files.
        """
        try:
            # Save the trained vectorizer
            joblib.dump(self.vectorizer, self.vectorizer_file)
            logger.info("Vectorizer saved")

            # Save the trained isolation forest model
            joblib.dump(self.isolation_forest, self.trained_model)
            logger.info("Isolation Forest model saved")
        except Exception as e:
            logger.error(f"Error saving Isolation Forest model: {e}")

    def run(self):
        """
        Orchestrates the entire workflow by calling the methods in sequence.
        """
        try:
            self.load_data()
            self.preprocess_data()
            vectorized_train, vectorized_test, train_index, test_index = self.vectorize_data()
            self.train_isolation_forest(vectorized_train, vectorized_test, train_index, test_index)
            self.evaluate_performance(test_index)
            self.save_isolation_forest_model()
        except Exception as e:
            logger.error(f"Error in run process: {e}")

# Example usage
if __name__ == "__main__":
    input_data_dir = current_directory + '/input/'
    processed_result_dir = current_directory + '/output/'
    detector = AnomalyClassifier(input_data_dir + 'data_4_modeling.csv', 
                                 processed_result_dir + 'vectorizer.pkl', 
                                 processed_result_dir + 'isolation_forest_model.pkl')
    detector.run()


2024-12-30 16:28:31,556 - INFO - Data loaded
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)  # Replace with mean (or use median, 0, etc.)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mode()[0], inplace=True)  # Replace with mode (most f

Current Working Directory: /Users/henrychang/sys_two_ai
