In [6]:
import csv
import pandas as pd
#We import numpy to handle fast computations on arrays
import numpy as np
from datetime import datetime
import io

#Import sklearn (open-source library for machine learning) in order to use Isolation Forest
from sklearn.ensemble import IsolationForest

#We import from sklearn again in order to split our x (independent) and y (dependent) variables into train/test sets
from sklearn.model_selection import train_test_split

#Step 1: Load the labelled events from our CSV file into a DataFrame
df = pd.read_csv("train_aws_5000.csv")

#Print (rows, columns) as a sanity check on how much data was loaded.
#We want at least 5000+ event records for the data analysis to be meaningful.
print(df.shape)

#Step 2: Select the features and the target (is_malicious) and prepare the
#data that is going to be used in our model.
#df.drop() removes the label column from the features:
#axis=1 means we are dropping a column, axis=0 would drop rows.
X = df.drop('is_malicious', axis=1)

y = df['is_malicious']

#Replace any null values with 0 so the model never sees NaN
X = X.fillna(0)

#One-hot encode the non-numeric columns, then cast every column to float.
#The cast goes straight to float: an intermediate cast to int would silently
#truncate fractional feature values (e.g. 0.7 -> 0) before training.
X = pd.get_dummies(X).astype(float)

#print(X.dtypes)



#Step 3: Split the data into a training set and a test set before we fit
#the Isolation Forest algorithm.

#Method 1 (not used) - manually split the data frame by row position, so the
#first 75% of rows are the training set and the rest are the test set.
#sev_five_pct = 0.75 * df.shape[0]
#train_set = df.iloc[:sev_five_pct-1, :]
#test_set = df.iloc[sev_five_pct:, :]
#train_set.shape, test_set.shape   # (rows, columns) of each split as a tuple

#Method 3 (used) - train_test_split from sklearn.
#You need more than 1 row of data in order for this method to work.
#test_size=0.2 holds out 20% of the rows; stratify=y preserves the original
#distribution of the target/label in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



#Baseline Isolation Forest

#Step 4: Build an Isolation Forest (an algorithm suited to anomaly detection)
#  n_estimators  => number of trees
#  contamination => expected proportion of anomalies
#  (the per-tree sample size is not set here, so it keeps the library default)
isolation_forest_algo = IsolationForest(
    n_estimators=150,
    contamination=0.2,
    random_state=42,
)

#Fit on the training data only in order to avoid leakage from the test set
isolation_forest_algo.fit(X_train)

#Step 5: Score and classify the held-out rows.
#Start from the original (un-encoded) rows that ended up in X_test and attach:
#  anomaly_score -> decision_function output (higher = more normal, lower = more anomalous)
#  anomaly       -> predict output using sklearn's default threshold of 0 (-1 anomaly, 1 normal)
X_test_data = df.loc[X_test.index].assign(
    anomaly_score=isolation_forest_algo.decision_function(X_test),
    anomaly=isolation_forest_algo.predict(X_test),
)

#Count how many test rows fall under each unique value of the anomaly column
print(X_test_data["anomaly"].value_counts())

#Step 6: Quick evaluation of the baseline through a classification report,
#a confusion matrix and an accuracy score.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Earlier attempt (kept for reference): derive the prediction from the
#default-threshold anomaly column.
#y_prediction = (X_test_data['anomaly'] == -1).astype(int)
#On a very small sample that gave [[0 0] [2 0]], read as
#   [[TN FP]
#    [FN TP]]
#which is why a bigger test set and a custom threshold are used below.

#decision_function outputs one anomaly score per sample;
#the lower the score, the more anomalous the sample is
scores = isolation_forest_algo.decision_function(X_test)

#Custom threshold instead of the default 0: the 20th percentile of the scores,
#so the lowest-scoring 20% are considered anomalies.
#NOTE(review): the cut-off is computed from the test-set scores themselves,
#so it depends on whichever batch is being scored.
threshold = np.percentile(scores, 20)

#Predicted labels: 1 for anomaly (score below threshold), 0 for normal
y_pred = np.where(scores < threshold, 1, 0)

#Show how many normal (0) and anomaly (1) labels are in the training and
#testing sets, to check whether the data is balanced
for heading, split_labels in (("Train labels:\n", y_train), ("Test labels:\n", y_test)):
    print(heading, split_labels.value_counts())

print("Baseline evaluation")
#Confusion matrix layout with labels=[0, 1]:
#  True Negative (TN)  - normal event correctly classified as normal
#  False Positive (FP) - normal event incorrectly flagged as anomaly
#  False Negative (FN) - anomaly event incorrectly flagged as normal
#  True Positive (TP)  - anomaly event correctly flagged as anomaly
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

#Classification report:
#  Precision -> of all predicted anomalies, how many were actually anomalies? TP / (TP + FP)
#  Recall    -> of all actual anomalies, how many did the model catch?        TP / (TP + FN)
#  F1-score  -> harmonic mean of precision and recall (balance between the two)
#  Support   -> number of true samples of each class
#  zero_division=0 avoids warnings if a class has zero predicted samples
print(
    classification_report(
        y_test,
        y_pred,
        target_names=["normal", "anomaly"],
        zero_division=0,
    )
)

#Accuracy: how close the predictions are to the actual (true) labels.
#This is the number we look at when tuning the model and reducing false positives.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")

#We can tune the model by searching over hyper-parameters (external settings that control how a machine learning model is trained).
# A grid search tries every combination of the listed hyper-parameter values and keeps the best one.
# We can use GridSearchCV or RandomizedSearchCV from sklearn

#We also import f1_score and make_scorer for scoring the candidates
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Scorer that maps IsolationForest output (-1/1) to 1/0 and then computes F1
# It is passed to GridSearchCV below as the scoring callable
def iso_f1_scorer(estimator, X_val, y_val):
    """Score a fitted Isolation Forest with F1 against binary labels.

    Args:
        estimator: Fitted model whose ``predict`` returns -1 for outliers
            and 1 for inliers.
        X_val: Feature rows to predict on.
        y_val: True labels for those rows (1 = anomaly, 0 = normal).

    Returns:
        The F1 score of the converted predictions; 0 when the score is
        undefined (``zero_division=0``).
    """
    raw_labels = estimator.predict(X_val)
    # Re-code -1 (outlier) as 1 and 1 (inlier) as 0 so predictions line up with y_val.
    is_outlier = raw_labels == -1
    return f1_score(y_val, is_outlier.astype(int), zero_division=0)

# Hyper-parameter values explored by the grid search (every combination is tried)
hyper_parameter_grid = dict(
    n_estimators=[100],
    max_samples=[0.6, 0.8, 1.0],
    contamination=[0.01, 0.02, 0.30],
    random_state=[42],
)

# 3-fold stratified cross-validation, shuffled with a fixed seed
our_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Each candidate is scored with iso_f1_scorer; error_score='raise' makes a
# failing fit stop the search instead of being recorded as a score.
grid = GridSearchCV(
    estimator=IsolationForest(random_state=42),
    param_grid=hyper_parameter_grid,
    scoring=iso_f1_scorer,
    cv=our_cv,
    n_jobs=1,
    verbose=1,
    error_score='raise',
)

# Fit the grid search object on the training data that we have
grid.fit(X_train, y_train)

# NOTE(review): the recorded run of this cell printed "Best Score: 0.0" and the
# tuned model flagged no test rows as anomalies - the contamination values in
# the grid are worth revisiting against the label balance printed earlier.
print("Tuned model")
print("Best Parameters:", grid.best_params_)
print("Best Score:", grid.best_score_)

# Evaluate the best estimator on the held-out test set (1 = anomaly, 0 = normal)
best_if = grid.best_estimator_
y_pred_best = np.where(best_if.predict(X_test) == -1, 1, 0)

print("Test Confusion: ", confusion_matrix(y_test, y_pred_best, labels=[0, 1]))
print(
    classification_report(
        y_test,
        y_pred_best,
        target_names=["normal", "anomaly"],
        zero_division=0,
    )
)
print("Test accuracy:", accuracy_score(y_test, y_pred_best))

#Adding the scoring/severity logic to my code

#decision_function is "higher score = more normal", so we negate it to get a
#severity score where higher = more anomalous.
X_test_data = df.loc[X_test.index].copy()
X_test_data["anomaly_score"]  = scores
X_test_data["severity_score"] = -scores
X_test_data["anomaly"] = (scores < threshold).astype(int)

#Bucket the severity score into tiers. The bin edges are ascending and pd.cut
#assigns the labels to the bins in that same order.
bins = [-np.inf, 0.05, 0.15, 0.30, np.inf]

#Labels must run from least to most severe so that the highest severity
#scores (the most anomalous rows) land in 'critical' - the previous order
#('critical' first) labelled the most normal rows as critical.
labels = ['low', 'medium', 'high', 'critical']

X_test_data["severity_tier"] = pd.cut(X_test_data["severity_score"], bins=bins, labels=labels)

print(X_test_data[["severity_score", "severity_tier", "anomaly"]].head())

#Persist the tuned model so it can be deployed
import joblib

#Write the best estimator found by the grid search to disk
joblib.dump(value=best_if, filename="first_pipeline.pkl")

#Load it back from the same file for serving, and set the decision-score
#cut-off that handler() compares against
model = joblib.load(filename="first_pipeline.pkl")
thresh = 0.05

def handler(event, ctx):
    """Score a batch of event records with the loaded model.

    The records are prepared the same way as the training features (nulls
    filled with 0, non-numeric columns one-hot encoded, everything cast to
    float) and then aligned to the columns the model was fitted on, so that
    raw records can be scored without a feature-mismatch error.

    Args:
        event: Mapping with a "records" entry holding the rows to score, in
            any form accepted by ``pd.DataFrame``.
        ctx: Invocation context; not used.

    Returns:
        A dict with "predictions" (1 when the score is below ``thresh``,
        otherwise 0) and "scores" (the model's decision-function values),
        both as plain lists. Both lists are empty when there are no records.

    Raises:
        KeyError: If ``event`` has no "records" entry.
    """
    frame = pd.DataFrame(event["records"])
    if frame.empty:
        # Nothing to score; the model cannot be called on an empty frame.
        return {"predictions": [], "scores": []}

    # Mirror the training-time preparation.
    frame = pd.get_dummies(frame.fillna(0))

    # Align to the training columns when the model recorded them: categories
    # missing from this batch become all-zero columns, unseen ones are dropped.
    expected_columns = getattr(model, "feature_names_in_", None)
    if expected_columns is not None:
        frame = frame.reindex(columns=expected_columns, fill_value=0)
    frame = frame.astype(float)

    batch_scores = model.decision_function(frame)
    preds = (batch_scores < thresh).astype(int)
    return {"predictions": preds.tolist(), "scores": batch_scores.tolist()}


(5000, 5035)
anomaly
 1    981
-1     19
Name: count, dtype: int64
Train labels:
 is_malicious
0    2048
1    1952
Name: count, dtype: int64
Test labels:
 is_malicious
0    512
1    488
Name: count, dtype: int64
Baseline evaluation
[[469  43]
 [337 151]]
              precision    recall  f1-score   support

      normal       0.58      0.92      0.71       512
     anomaly       0.78      0.31      0.44       488

    accuracy                           0.62      1000
   macro avg       0.68      0.61      0.58      1000
weighted avg       0.68      0.62      0.58      1000

Accuracy Score: 0.62
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Tuned model
Best Parameters: {'contamination': 0.01, 'max_samples': 0.6, 'n_estimators': 100, 'random_state': 42}
Best Score: 0.0
Test Confusion:  [[512   0]
 [488   0]]
              precision    recall  f1-score   support

      normal       0.51      1.00      0.68       512
     anomaly       0.00      0.00      0.00       488

   