# **Training Notebook**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import joblib
import os

# Load data
# The path is relative to the notebook's working directory.
data_path = 'ec2_network_in_257a54.csv'

try:
    # Only the first 10,000 rows of the CSV are read.
    df = pd.read_csv(data_path, nrows=10000)
except FileNotFoundError:
    print(f"Error: The file '{data_path}' was not found.")
    # Re-raise rather than calling exit(): inside a notebook kernel exit()
    # requests a kernel shutdown instead of cleanly aborting the cell, so the
    # statements below could still run without a freshly loaded `df`.
    raise

# Rename columns (reassign instead of inplace=True so the step is explicit)
df = df.rename(columns={'value': 'network_in', 'timestamp': 'Timestamp'})

# Convert Timestamp and set as index
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.set_index('Timestamp')

# Handle missing values: impute only the 'network_in' column with its own
# mean, so that any other column is never filled with an unrelated statistic.
df['network_in'] = df['network_in'].fillna(df['network_in'].mean())

# Feature Engineering: Scaling
# Fit the scaler on the training portion only. Fitting on the full series
# (as was done before) leaks the held-out rows' mean/variance into the
# training features and into the scaler that is saved later.
# shuffle=False keeps the split chronological and deterministic, so this
# preliminary split has the same boundary as the final split below.
raw_train, _raw_test = train_test_split(df[['network_in']], test_size=0.2, shuffle=False)

scaler = StandardScaler()
scaler.fit(raw_train)

# Apply the train-fitted scaling to the whole series.
df['network_in_scaled'] = scaler.transform(df[['network_in']])

# Split Data (Use ONLY the scaled feature)
X = df[['network_in_scaled']]  # Only use the scaled feature
y = df['network_in']  # raw series, split alongside X with the same boundary

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Output locations for the persisted artifacts (relative to the working directory).
model_filename = 'isolation_forest_model.joblib'
scaler_filename = 'scaler.joblib'

# Build and fit the Isolation Forest on the training features.
# fit() returns the estimator itself, so `model` is the fitted forest.
model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    random_state=42,
).fit(X_train)

# Persist the model first, then the scaler, each under its own filename.
for artifact, target_path in ((model, model_filename), (scaler, scaler_filename)):
    joblib.dump(artifact, target_path)

# Report where each artifact was written.
print(f"Model saved to: {model_filename}")
print(f"Scaler saved to: {scaler_filename}")

Model saved to: isolation_forest_model.joblib
Scaler saved to: scaler.joblib
