In [1]:
import pandas as pd
import pickle
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

print("--- Starting ADVANCED Full Project Pipeline ---")

--- Starting ADVANCED Full Project Pipeline ---


In [2]:
# --- 1. Configuration & Setup ---
# =================================
DATA_DIR = 'data'
MODELS_DIR = 'models'
VIS_DIR = 'visualizations'

FILE_CSV = os.path.join(DATA_DIR, 'file.csv')
USERS_CSV = os.path.join(DATA_DIR, 'users.csv')
PROCESSED_DATA_PATH = os.path.join(DATA_DIR, 'processed_data.csv')

RF_PIPELINE_PATH = os.path.join(MODELS_DIR, 'random_forest_pipeline.pkl')
SVM_PIPELINE_PATH = os.path.join(MODELS_DIR, 'svm_pipeline.pkl')
ISO_PIPELINE_PATH = os.path.join(MODELS_DIR, 'isolation_forest_pipeline.pkl')

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(VIS_DIR, exist_ok=True)
print(f"‚úÖ Directories verified/created.")

‚úÖ Directories verified/created.


In [3]:
# --- 2. Exploratory Data Analysis & Visualization ---
# ======================================================
print("\n--- Phase 1: Starting Data Analysis & Visualization ---")
try:
    print("Analyzing users.csv...")
    users_df_analysis = pd.read_csv(USERS_CSV)
    top_roles = users_df_analysis['role'].value_counts().nlargest(10)
    plt.figure(figsize=(12, 7))
    sns.barplot(x=top_roles.values, y=top_roles.index, palette='viridis')
    plt.title('Top 10 Most Common Job Roles', fontsize=16)
    plt.xlabel('Number of Employees', fontsize=12)
    plt.ylabel('Job Role', fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, '1_top_job_roles.png'))
    plt.close()
    print("  -> ‚úÖ Chart '1_top_job_roles.png' saved.")

    print("Analyzing file.csv...")
    chunk_reader_analysis = pd.read_csv(FILE_CSV, usecols=['activity', 'date'], chunksize=500_000)
    all_activities = pd.Series(dtype='int64')
    hourly_activity = pd.Series(dtype='int64', index=range(24)).fillna(0)
    for chunk in chunk_reader_analysis:
        all_activities = all_activities.add(chunk['activity'].value_counts(), fill_value=0)
        chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
        hourly_counts = chunk['date'].dt.hour.value_counts()
        hourly_activity = hourly_activity.add(hourly_counts, fill_value=0)

    top_file_activities = all_activities.nlargest(10).astype(int)
    plt.figure(figsize=(12, 7))
    sns.barplot(x=top_file_activities.values, y=top_file_activities.index, palette='magma')
    plt.title('Top 10 Most Common File Activities', fontsize=16)
    plt.xlabel('Total Count', fontsize=12)
    plt.ylabel('Activity Type', fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, '2_top_file_activities.png'))
    plt.close()
    print("  -> ‚úÖ Chart '2_top_file_activities.png' saved.")

    hourly_activity = hourly_activity.fillna(0).astype(int)
    plt.figure(figsize=(12, 7))
    sns.barplot(x=hourly_activity.index, y=hourly_activity.values, color='royalblue')
    plt.title('File System Activity by Hour of the Day', fontsize=16)
    plt.xlabel('Hour of Day (0-23)', fontsize=12)
    plt.ylabel('Number of Activities', fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(VIS_DIR, '3_activity_by_hour.png'))
    plt.close()
    print("  -> ‚úÖ Chart '3_activity_by_hour.png' saved.")
    print("--- Analysis Complete ---")
except Exception as e:
    print(f"  -> ‚ùå Error during analysis phase: {e}")


--- Phase 1: Starting Data Analysis & Visualization ---
Analyzing users.csv...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_roles.values, y=top_roles.index, palette='viridis')


  -> ‚úÖ Chart '1_top_job_roles.png' saved.
Analyzing file.csv...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_file_activities.values, y=top_file_activities.index, palette='magma')


  -> ‚úÖ Chart '2_top_file_activities.png' saved.
  -> ‚úÖ Chart '3_activity_by_hour.png' saved.
--- Analysis Complete ---


In [4]:
# --- 3. ADVANCED Data Processing & Feature Engineering ---
# =======================================================
print("\n--- Phase 2: Starting Advanced Data Processing ---")
try:
    users_df = pd.read_csv(USERS_CSV, usecols=['user_id', 'role'])
    if os.path.exists(PROCESSED_DATA_PATH): os.remove(PROCESSED_DATA_PATH)
    
    print("  -> Step 2a: Calculating user activity baselines...")
    chunk_reader_baseline = pd.read_csv(FILE_CSV, usecols=['user', 'date'], chunksize=500_000)
    user_activity_counts = pd.Series(dtype='int64')
    for chunk in chunk_reader_baseline:
        user_activity_counts = user_activity_counts.add(chunk['user'].value_counts(), fill_value=0)
    
    user_baselines = pd.DataFrame({'user': user_activity_counts.index, 'total_actions': user_activity_counts.values})
    user_baselines['avg_actions_per_day'] = user_baselines['total_actions'] / user_baselines['total_actions'].max()
    print("  -> ‚úÖ User baselines calculated.")

    print("  -> Step 2b: Processing data chunks with new features...")
    chunk_reader_process = pd.read_csv(FILE_CSV, usecols=['user', 'activity', 'date'], chunksize=500_000)
    features_to_keep = ['activity', 'role', 'hour_of_day', 'day_of_week', 'is_weekend', 'avg_actions_per_day', 'risky']

    for i, chunk in enumerate(chunk_reader_process):
        merged_chunk = pd.merge(chunk, users_df, left_on='user', right_on='user_id', how='left')
        merged_chunk = pd.merge(merged_chunk, user_baselines[['user', 'avg_actions_per_day']], on='user', how='left')
        merged_chunk['date'] = pd.to_datetime(merged_chunk['date'], errors='coerce')
        merged_chunk['hour_of_day'] = merged_chunk['date'].dt.hour
        merged_chunk['day_of_week'] = merged_chunk['date'].dt.dayofweek
        merged_chunk['is_weekend'] = merged_chunk['day_of_week'].isin([5, 6]).astype(int)
        merged_chunk['risky'] = ((merged_chunk['hour_of_day'] < 8) | (merged_chunk['hour_of_day'] > 18) | (merged_chunk['is_weekend'] == 1)).astype(int)
        
        processed_chunk = merged_chunk[features_to_keep].dropna()
        processed_chunk.to_csv(PROCESSED_DATA_PATH, mode='a', index=False, header=(i == 0))
        if (i+1) % 5 == 0: print(f"    -> Processed chunk {i+1}...")

    print("\nLoading final processed dataset...")
    df_model = pd.read_csv(PROCESSED_DATA_PATH)
    print("‚úÖ Final dataset with advanced features loaded.")

except Exception as e:
    print(f"\n‚ùå Error during data processing: {e}")
    exit()



--- Phase 2: Starting Advanced Data Processing ---
  -> Step 2a: Calculating user activity baselines...
  -> ‚úÖ User baselines calculated.
  -> Step 2b: Processing data chunks with new features...
    -> Processed chunk 5...

Loading final processed dataset...
‚úÖ Final dataset with advanced features loaded.


In [5]:
# --- 4. Advanced Preprocessing with Pipelines ---
# ================================================
print("\n--- Phase 3: Preparing Data and Building Preprocessing Pipeline ---")
if len(df_model) > 200_000:
    df_model = df_model.sample(n=200_000, random_state=42)
    print("‚öôÔ∏è Dataset sampled for faster training.")

target = 'risky'
X = df_model.drop(target, axis=1)
y = df_model[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

categorical_features = ['activity', 'role']
numerical_features = ['hour_of_day', 'day_of_week', 'is_weekend', 'avg_actions_per_day']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
print("‚úÖ Preprocessing pipeline created.")


--- Phase 3: Preparing Data and Building Preprocessing Pipeline ---
‚öôÔ∏è Dataset sampled for faster training.
‚úÖ Preprocessing pipeline created.


In [6]:
# --- 5. Train All 3 Models with ADVANCED Tuning ---
# =====================================================
print("\n--- Phase 4: Training All Models ---")

# --- Model 1: Random Forest with Hyperparameter Tuning ---
print("\n--- Training Random Forest Pipeline with Tuning ---")
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])

param_dist = {
    'classifier__n_estimators': [100, 150],
    'classifier__max_depth': [10, 20, 30]
}
random_search = RandomizedSearchCV(rf_pipeline, param_distributions=param_dist, n_iter=4, cv=3, random_state=42, n_jobs=-1)

start_time = time.time()
random_search.fit(X_train, y_train)
print(f"Best RF Parameters: {random_search.best_params_}")
best_rf_pipeline = random_search.best_estimator_
preds = best_rf_pipeline.predict(X_test)
print(f"Tuned Random Forest Accuracy: {accuracy_score(y_test, preds)*100:.2f}%")
print(classification_report(y_test, preds))
with open(RF_PIPELINE_PATH, 'wb') as f: pickle.dump(best_rf_pipeline, f)
print(f"‚úÖ Tuned Random Forest pipeline saved.")
print(f"‚è±  Execution Time: {time.time() - start_time:.2f} seconds")

# --- Model 2: Linear SVM ---
print("\n--- Training Linear SVM Pipeline ---")
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(max_iter=2000, class_weight='balanced', random_state=42, dual=False))
])
start_time = time.time()
svm_pipeline.fit(X_train, y_train)
preds = svm_pipeline.predict(X_test)
print(f"Linear SVM Accuracy: {accuracy_score(y_test, preds)*100:.2f}%")
with open(SVM_PIPELINE_PATH, 'wb') as f: pickle.dump(svm_pipeline, f)
print(f"‚úÖ Linear SVM pipeline saved.")
print(f"‚è±  Execution Time: {time.time() - start_time:.2f} seconds")


# --- Model 3: Isolation Forest ---
print("\n--- Training Isolation Forest Pipeline ---")
contamination = y_train.value_counts(normalize=True).get(1, 0.01)
iso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', IsolationForest(n_estimators=100, contamination=contamination, random_state=42, n_jobs=-1))
])
start_time = time.time()
iso_pipeline.fit(X_train)
raw_preds = iso_pipeline.predict(X_test)
preds = [1 if p == -1 else 0 for p in raw_preds]
print(f"Isolation Forest Accuracy: {accuracy_score(y_test, preds)*100:.2f}%")
with open(ISO_PIPELINE_PATH, 'wb') as f: pickle.dump(iso_pipeline, f)
print(f"‚úÖ Isolation Forest pipeline saved.")
print(f"‚è±  Execution Time: {time.time() - start_time:.2f} seconds")

print("\n\nüéØ All tasks complete. Your ADVANCED model pipelines are trained and saved! ---")




--- Phase 4: Training All Models ---

--- Training Random Forest Pipeline with Tuning ---
Best RF Parameters: {'classifier__n_estimators': 150, 'classifier__max_depth': 30}
Tuned Random Forest Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36285
           1       1.00      1.00      1.00      3715

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000

‚úÖ Tuned Random Forest pipeline saved.
‚è±  Execution Time: 102.67 seconds

--- Training Linear SVM Pipeline ---
Linear SVM Accuracy: 96.31%
‚úÖ Linear SVM pipeline saved.
‚è±  Execution Time: 1.49 seconds

--- Training Isolation Forest Pipeline ---
Isolation Forest Accuracy: 91.88%
‚úÖ Isolation Forest pipeline saved.
‚è±  Execution Time: 1.49 seconds


üéØ All tasks complete. Your ADVANCED model pipelines are trained and saved! ---
