In [3]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
import gc
from joblib import dump, load
import psutil
warnings.filterwarnings('ignore')

def get_memory_usage():
    """Get current memory usage"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024 / 1024  # GB

print("Libraries imported successfully!")
print(f"Initial memory usage: {get_memory_usage():.2f} GB")


Libraries imported successfully!
Initial memory usage: 0.81 GB


In [4]:
def load_csvs_efficiently(csv_files, chunk_size=50000, sample_fraction=None):
    """
    Load CSV files efficiently using chunking to avoid memory overload
    """
    print("=== MEMORY-EFFICIENT DATA LOADING ===")
    
    file_info = []
    total_rows = 0
    
    for file_pattern in csv_files:
        try:
            files = glob.glob(file_pattern)
            if not files:
                files = glob.glob(f"*{file_pattern}*")
            
            for file in files:
                with open(file, 'r') as f:
                    row_count = sum(1 for line in f) - 1  # subtract header
                
                file_info.append((file, row_count))
                total_rows += row_count
                print(f"{file}: {row_count:,} rows")
        
        except Exception as e:
            print(f"Error checking {file_pattern}: {e}")
    
    print(f"Total rows across all files: {total_rows:,}")
    
    # If sampling is requested, calculate sample size
    if sample_fraction:
        sample_size = int(total_rows * sample_fraction)
        print(f"Will sample {sample_size:,} rows ({sample_fraction*100:.1f}%)")
    
    # Load data in chunks
    all_chunks = []
    rows_processed = 0
    
    for file, row_count in file_info:
        print(f"\nProcessing {file}...")
        
        # Calculate skip probability for sampling
        if sample_fraction:
            skip_prob = 1 - sample_fraction
        else:
            skip_prob = 0
        
        chunk_iter = pd.read_csv(file, chunksize=chunk_size, low_memory=True)
        
        for i, chunk in enumerate(chunk_iter):
            # Sample chunk if needed
            if sample_fraction and sample_fraction < 1.0:
                chunk = chunk.sample(frac=sample_fraction, random_state=42)
            
            # Add source file info
            chunk['source_file'] = file.split('/')[-1].replace('.csv', '')
            
            # Optimize dtypes to save memory
            chunk = optimize_dtypes(chunk)
            
            all_chunks.append(chunk)
            rows_processed += len(chunk)
            
            if (i + 1) % 10 == 0:
                print(f"  Processed {i+1} chunks, {rows_processed:,} rows, Memory: {get_memory_usage():.2f} GB")
                
                # Force garbage collection
                gc.collect()
    
    print(f"\nCombining {len(all_chunks)} chunks...")
    combined_df = pd.concat(all_chunks, ignore_index=True)
    
    # Clear chunks from memory
    del all_chunks
    gc.collect()
    
    print(f"Final dataset shape: {combined_df.shape}")
    print(f"Memory usage after loading: {get_memory_usage():.2f} GB")
    
    return combined_df, file_info

def optimize_dtypes(df):
    """Optimize data types to reduce memory usage"""
    for col in df.columns:
        if df[col].dtype == 'int64':
            if df[col].min() >= -128 and df[col].max() <= 127:
                df[col] = df[col].astype('int8')
            elif df[col].min() >= -32768 and df[col].max() <= 32767:
                df[col] = df[col].astype('int16')
            elif df[col].min() >= -2147483648 and df[col].max() <= 2147483647:
                df[col] = df[col].astype('int32')
        
        elif df[col].dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
    
    return df

csv_files = ['03-11/LDAP.csv', '03-11/MSSQL.csv', '03-11/NetBIOS.csv', '03-11/Portmap.csv', '03-11/Syn.csv', '03-11/UDP.csv', '03-11/UDPLag.csv']

SAMPLE_FRACTION = 0.1  # Adjust this: 0.1 = 10%, 0.05 = 5%, None = full dataset
CHUNK_SIZE = 10000     # Reduce if still getting memory errors

df, file_info = load_csvs_efficiently(csv_files, 
                                     chunk_size=CHUNK_SIZE, 
                                     sample_fraction=SAMPLE_FRACTION)


=== MEMORY-EFFICIENT DATA LOADING ===
03-11/LDAP.csv: 2,113,234 rows
03-11/MSSQL.csv: 5,775,786 rows
03-11/NetBIOS.csv: 3,455,899 rows
03-11/Portmap.csv: 191,694 rows
03-11/Syn.csv: 4,320,541 rows
03-11/UDP.csv: 3,782,206 rows
03-11/UDPLag.csv: 725,165 rows
Total rows across all files: 20,364,525
Will sample 2,036,452 rows (10.0%)

Processing 03-11/LDAP.csv...
  Processed 10 chunks, 10,000 rows, Memory: 0.82 GB
  Processed 20 chunks, 20,000 rows, Memory: 0.83 GB
  Processed 30 chunks, 30,000 rows, Memory: 0.83 GB
  Processed 40 chunks, 40,000 rows, Memory: 0.84 GB
  Processed 50 chunks, 50,000 rows, Memory: 0.84 GB
  Processed 60 chunks, 60,000 rows, Memory: 0.85 GB
  Processed 70 chunks, 70,000 rows, Memory: 0.86 GB
  Processed 80 chunks, 80,000 rows, Memory: 0.86 GB
  Processed 90 chunks, 90,000 rows, Memory: 0.87 GB
  Processed 100 chunks, 100,000 rows, Memory: 0.87 GB
  Processed 110 chunks, 110,000 rows, Memory: 0.88 GB
  Processed 120 chunks, 120,000 rows, Memory: 0.89 GB
  Proce

KeyboardInterrupt: 

In [None]:
# Cell 3: Memory-Efficient Data Exploration
print("\n=== DATA EXPLORATION ===")
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
print(f"Current system memory: {get_memory_usage():.2f} GB")

# Sample exploration to avoid memory issues
sample_df = df.sample(n=min(1000, len(df)), random_state=42)
print(f"\nColumn data types (from sample):")
print(sample_df.dtypes.value_counts())

print(f"\nFirst few rows:")
print(df.head())

# Memory cleanup
del sample_df
gc.collect()



=== DATA EXPLORATION ===
Dataset shape: (2036452, 89)
Memory usage: 1.61 GB
Current system memory: 1.59 GB

Column data types (from sample):
float64    25
int8       20
float32    20
int32      11
object      7
int16       3
int64       3
Name: count, dtype: int64

First few rows:
   Unnamed: 0                               Flow ID   Source IP   Source Port  \
0       18347  172.16.0.5-192.168.50.4-580-39068-17  172.16.0.5           580   
1      185707   172.16.0.5-192.168.50.4-790-2619-17  172.16.0.5           790   
2       61775  172.16.0.5-192.168.50.4-821-57411-17  172.16.0.5           821   
3      198824  172.16.0.5-192.168.50.4-841-21511-17  172.16.0.5           841   
4        6466  172.16.0.5-192.168.50.4-649-56838-17  172.16.0.5           649   

   Destination IP   Destination Port   Protocol                   Timestamp  \
0    192.168.50.4              39068         17  2018-11-03 10:09:01.351731   
1    192.168.50.4               2619         17  2018-11-03 10:09:01.158

0

In [None]:
# Cell 4: Memory-Efficient Missing Value Handling
print("\n=== MEMORY-EFFICIENT MISSING VALUE HANDLING ===")

def handle_missing_values_efficiently(df, batch_size=100000):
    """Handle missing values in batches to avoid memory issues"""
    
    print("Checking missing values...")
    missing_info = df.isnull().sum()
    cols_with_missing = missing_info[missing_info > 0].index.tolist()
    
    if len(cols_with_missing) == 0:
        print("No missing values found!")
        return df
    
    print(f"Columns with missing values: {len(cols_with_missing)}")
    
    # Calculate fill values first
    fill_values = {}
    
    for col in cols_with_missing:
        if df[col].dtype in ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']:
            fill_values[col] = df[col].median()
            print(f"{col}: Will fill with median {fill_values[col]}")
        else:
            mode_val = df[col].mode()
            fill_values[col] = mode_val[0] if len(mode_val) > 0 else 'unknown'
            print(f"{col}: Will fill with mode '{fill_values[col]}'")
    
    # Fill missing values
    df.fillna(fill_values, inplace=True)
    
    print("Missing values handled!")
    return df

df = handle_missing_values_efficiently(df)
gc.collect()



=== MEMORY-EFFICIENT MISSING VALUE HANDLING ===
Checking missing values...
Columns with missing values: 1
Flow Bytes/s: Will fill with median 458000000.0
Missing values handled!


0

In [None]:
# Cell 5: Memory-Efficient Data Preparation
print("\n=== MEMORY-EFFICIENT DATA PREPARATION ===")

# Find target column
possible_targets = ['label', 'target', 'attack_type', 'class', 'Label', 'Target', 'Attack_Type', 'Class']
target_col = None

for col in possible_targets:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    # Look for categorical columns with few unique values
    for col in df.columns:
        if col != 'source_file' and df[col].nunique() < 50:
            print(f"Potential target: {col} ({df[col].nunique()} unique values)")
            target_col = col
            break

if target_col is None:
    target_col = df.columns[-2]  # Assume second to last (before source_file)

print(f"Using target column: {target_col}")
print(f"Target distribution:")
print(df[target_col].value_counts())

# Prepare features and target
X = df.drop([target_col, 'source_file'], axis=1, errors='ignore')
y = df[target_col].copy()

# Clean up original dataframe
del df
gc.collect()

print(f"Feature matrix shape: {X.shape}")
print(f"Memory usage: {get_memory_usage():.2f} GB")



=== MEMORY-EFFICIENT DATA PREPARATION ===
Potential target:  Protocol (3 unique values)
Using target column:  Protocol
Target distribution:
 Protocol
17    1543541
6      492767
0         144
Name: count, dtype: int64
Feature matrix shape: (2036452, 87)
Memory usage: 1.49 GB


In [None]:
# Cell 6: Efficient Encoding and Scaling
print("\n=== EFFICIENT ENCODING AND SCALING ===")

# Encode categorical features efficiently
label_encoders = {}
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Encoding {len(categorical_cols)} categorical columns...")
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le
    print(f"Encoded: {col}")

# Encode target if categorical
target_encoder = None
if y.dtype == 'object':
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(y)
    print("Target encoded")

gc.collect()



=== EFFICIENT ENCODING AND SCALING ===
Encoding 6 categorical columns...
Encoded: Flow ID
Encoded:  Source IP
Encoded:  Destination IP
Encoded:  Timestamp
Encoded: SimillarHTTP
Encoded:  Label


0

In [None]:
# Cell 7: Memory-Efficient Train-Test Split
print("\n=== MEMORY-EFFICIENT DATA SPLITTING ===")

# For very large datasets, use stratified sampling for test set
test_size = min(0.2, 200000 / len(X))  # Cap test set at 200k samples
print(f"Test size: {test_size:.4f} ({int(len(X) * test_size):,} samples)")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Memory usage: {get_memory_usage():.2f} GB")

# Clean up full datasets
del X, y
gc.collect()



=== MEMORY-EFFICIENT DATA SPLITTING ===
Test size: 0.0982 (200,000 samples)
Training set: (1836452, 87)
Test set: (200000, 87)
Memory usage: 2.26 GB


0

In [None]:
# Cell 8: Batch Scaling to Avoid Memory Issues
print("\n=== BATCH SCALING ===")

def fit_scaler_in_batches(X_train, batch_size=50000):
    """Fit scaler using batches to avoid memory issues"""
    scaler = StandardScaler()
    
    print("Fitting scaler in batches...")
    n_samples = len(X_train)
    
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = X_train.iloc[start_idx:end_idx]
        
        scaler.partial_fit(batch)
        
        if (start_idx // batch_size + 1) % 10 == 0:
            print(f"  Processed batch {start_idx // batch_size + 1}/{(n_samples - 1) // batch_size + 1}")
    
    return scaler

# Fit scaler
scaler = fit_scaler_in_batches(X_train, batch_size=50000)

def transform_in_batches(scaler, X, batch_size=50000):
    """Transform data in batches"""
    n_samples = len(X)
    transformed_batches = []
    
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch = X.iloc[start_idx:end_idx]
        transformed_batch = scaler.transform(batch)
        transformed_batches.append(transformed_batch)
    
    return np.vstack(transformed_batches)

print("Scaling training data...")
X_train_scaled = transform_in_batches(scaler, X_train)
print("Scaling test data...")
X_test_scaled = transform_in_batches(scaler, X_test)

print(f"Scaling completed. Memory: {get_memory_usage():.2f} GB")



=== BATCH SCALING ===
Fitting scaler in batches...


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# Cell 9: Memory-Efficient Model Training
print("\n=== MEMORY-EFFICIENT MODEL TRAINING ===")

# Use smaller Random Forest for large datasets
rf_model = RandomForestClassifier(
    n_estimators=50,      # Reduced from 100
    max_depth=15,         # Reduced from 20
    min_samples_split=10, # Increased
    min_samples_leaf=5,   # Increased
    max_features='sqrt',  # Use sqrt instead of all features
    random_state=42,
    n_jobs=2,            # Limit parallel jobs to save memory
    verbose=1            # Show progress
)

print("Training Random Forest model...")
print(f"Training samples: {X_train_scaled.shape[0]:,}")
print(f"Features: {X_train_scaled.shape[1]}")

rf_model.fit(X_train_scaled, y_train)
print("Model training completed!")


In [None]:
# Cell 10: Batch Prediction and Evaluation
print("\n=== BATCH PREDICTION AND EVALUATION ===")

def predict_in_batches(model, X, batch_size=10000):
    """Make predictions in batches to avoid memory issues"""
    predictions = []
    n_samples = len(X)
    
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch_pred = model.predict(X[start_idx:end_idx])
        predictions.extend(batch_pred)
    
    return np.array(predictions)

print("Making predictions...")
y_train_pred = predict_in_batches(rf_model, X_train_scaled, batch_size=10000)
y_test_pred = predict_in_batches(rf_model, X_test_scaled, batch_size=10000)

# Calculate accuracies
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\n=== RESULTS ===")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Classification report
print(f"\n=== CLASSIFICATION REPORT ===")
if target_encoder:
    target_names = target_encoder.classes_
    print(classification_report(y_test, y_test_pred, target_names=target_names))
else:
    print(classification_report(y_test, y_test_pred))

# Feature importance
print(f"\n=== TOP 10 MOST IMPORTANT FEATURES ===")
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))


In [None]:
# Cell 11: Save Models Efficiently
print("\n=== SAVING MODELS ===")

# Use joblib for large models (more efficient than pickle)
print("Saving scaler...")
dump(scaler, 'scaler.pkl', compress=3)
print("Scaler saved as 'scaler.pkl'")

print("Saving model...")
dump(rf_model, 'model.pkl', compress=3)
print("Model saved as 'model.pkl'")

if target_encoder:
    dump(target_encoder, 'target_encoder.pkl', compress=3)
    print("Target encoder saved")

if label_encoders:
    dump(label_encoders, 'label_encoders.pkl', compress=3)
    print("Label encoders saved")

# Save feature names
dump(list(X_train.columns), 'feature_names.pkl', compress=3)
print("Feature names saved")

# Save model metadata
metadata = {
    'train_accuracy': train_accuracy,
    'test_accuracy': test_accuracy,
    'n_features': X_train_scaled.shape[1],
    'n_train_samples': len(X_train_scaled),
    'n_test_samples': len(X_test_scaled),
    'sample_fraction_used': SAMPLE_FRACTION,
    'model_params': rf_model.get_params()
}

dump(metadata, 'model_metadata.pkl')
print("Model metadata saved")

print(f"\n=== FINAL SUMMARY ===")
print(f"✅ Model Training Completed Successfully!")
print(f"📊 Final Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"🔢 Training Samples: {len(X_train_scaled):,}")
print(f"🔢 Test Samples: {len(X_test_scaled):,}")
print(f"📁 Features: {X_train_scaled.shape[1]}")
print(f"💾 Peak Memory Usage: {get_memory_usage():.2f} GB")
print(f"📦 All model files saved successfully!")


In [None]:
# Cell 12: Memory-Efficient Model Loading Test
def test_saved_model():
    """Test loading and using the saved model"""
    print("\n=== TESTING SAVED MODEL ===")
    
    # Load components
    loaded_scaler = load('scaler.pkl')
    loaded_model = load('model.pkl')
    feature_names = load('feature_names.pkl')
    metadata = load('model_metadata.pkl')
    
    print("✅ All components loaded successfully!")
    print(f"📊 Saved model accuracy: {metadata['test_accuracy']:.4f}")
    
    # Test with a small sample
    if len(X_test_scaled) > 0:
        test_sample = X_test_scaled[:5]
        predictions = loaded_model.predict(test_sample)
        print(f"🔮 Sample predictions: {predictions}")
    
    return loaded_model, loaded_scaler

# Run the test
test_saved_model()