In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import joblib

# Load the simulated log; the 'timestamp' column is parsed into datetimes.
df = pd.read_csv(
    '../data/simulated_shadow_ai_logs.csv',
    parse_dates=['timestamp'],
)

# Encode the data classification as an integer code: Public=0 ... PII=3.
# NOTE(review): any label not listed here maps to NaN — confirm the CSV only
# contains these four values.
dc_map = {
    name: code
    for code, name in enumerate(['Public', 'Internal', 'Confidential', 'PII'])
}
df['data_class_num'] = df['data_class'].map(dc_map)

# Cast the attachment flag to an integer column.
df['attached_file'] = df['attached_file'].astype(int)

# Off-hours flag: 1 when the hour is below 8 or above 19, otherwise 0.
_hour = df['hour']
df['is_off_hours'] = ((_hour < 8) | (_hour > 19)).astype('int64')

# Terms that kw_score looks for inside a prompt.
keywords = [
    'salary', 'payroll', 'ssn', 'customer id', 'password',
    'confidential', 'contract', 'pii', 'dob', 'account',
]


def kw_score(text):
    """Return how many distinct entries of ``keywords`` occur in ``text``.

    The input is coerced with ``str()`` and lower-cased before matching, so
    the check is case-insensitive and non-string values (``None``, NaN) are
    accepted. Matching is by plain substring — ``'adobe'`` counts as a hit
    for ``'dob'`` — and each keyword contributes at most 1 no matter how
    often it appears.
    """
    lowered = str(text).lower()
    hits = 0
    for term in keywords:
        if term in lowered:
            hits += 1
    return hits
# Stringify every prompt once so each text feature sees the same input
# (missing values become the literal text produced by str()).
_prompts = df['prompt_text'].map(str)

# Number of sensitive keywords found in each prompt.
df['kw_score'] = _prompts.map(kw_score)

# VADER 'neg' component of each prompt's polarity scores.
analyser = SentimentIntensityAnalyzer()
df['sent_neg'] = [analyser.polarity_scores(p)['neg'] for p in _prompts]

# Prompt length in whitespace-separated tokens.
df['prompt_len'] = [len(p.split()) for p in _prompts]

# Binary target: 1 for rows whose approval_status is exactly 'Unapproved'.
df['label'] = df['approval_status'].eq('Unapproved').astype('int64')

# Columns fed to the model, and the target column.
features = [
    'data_class_num',
    'attached_file',
    'is_off_hours',
    'kw_score',
    'sent_neg',
    'prompt_len',
    'response_length',
]
X = df.loc[:, features]
y = df['label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The scaler is fitted on the training split only and then applied to both
# splits; from here on X_train / X_test hold the scaler's transformed output.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random-forest settings: class_weight='balanced' reweights the classes,
# random_state pins the seed.
_rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf = RandomForestClassifier(**_rf_params).fit(X_train, y_train)

# Hard predictions feed the classification report; column 1 of the
# probability matrix (second entry of rf.classes_) feeds ROC-AUC.
y_pred = rf.predict(X_test)
_proba = rf.predict_proba(X_test)
y_proba = _proba[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

import os

# Persist the fitted model and the scaler it was trained with.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first instead of failing after training has finished.
os.makedirs('../models', exist_ok=True)
joblib.dump(rf, '../models/rf_model.joblib')
joblib.dump(scaler, '../models/scaler.joblib')

# The check mark is written as an escape (U+2705) so the message cannot be
# corrupted if the source file is re-saved with a different encoding.
print("\u2705 Model and scaler saved!")
