
# Complete Data Protection Project — Full BERT Training Version (Option B)

This notebook contains the **full pipeline**:
1. Full DistilBERT fine-tuning (production-style). Use `QUICK_RUN=False` for full training.
2. Integration pipeline — hybrid regex + BERT inference, masking/encryption, policy validation.
3. Anomaly detection, reports generation.
4. Auto-creates Streamlit dashboard at `dashboards/streamlit_app.py`.

**Important notes before running**:
- This **full** notebook will download transformer weights and can be slow on CPU. Use a GPU runtime if possible.
- Set `QUICK_RUN=True` to run a small quick demo (1 epoch, small subset). For full training, set `QUICK_RUN=False` and ensure you have sufficient RAM and GPU.
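- Ensure `data/data_protection_dataset.csv` exists in the project folder. The pipeline reads the columns `name`, `email`, `phone`, `aadhaar`, `pan`, `address`, `transaction_amount`, `access_time`, and `event_type`.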


In [2]:

# Install dependencies (run once)
!pip install -q transformers datasets torch scikit-learn pandas joblib cryptography pyyaml tqdm streamlit accelerate
print('Installed dependencies (or already present).')

Installed dependencies (or already present).


In [14]:

import os, sys, json, math, random, time
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch.nn.functional as F

# allow importing from src/
# Appends the absolute path of ./src (resolved against the current working
# directory) to the end of the module search path.
sys.path.append(os.path.abspath('./src'))

# Report the installed torch version and whether a CUDA device is visible.
print('Torch', torch.__version__, 'CUDA available:', torch.cuda.is_available())


Torch 2.9.1+cpu CUDA available: False


In [16]:

# Settings - change as needed
# Input dataset and the directory the fine-tuned model is written to.
DATA_PATH = 'data/data_protection_dataset.csv'
MODEL_DIR = 'models/bert_sensitivity'

# Output folders used by later cells; creating them is a no-op when they exist.
for _out_dir in ('models', 'dashboards', 'reports'):
    os.makedirs(_out_dir, exist_ok=True)

# False = full training (Option B). Set True for quick demo.
QUICK_RUN = False
# The quick demo trains for a single epoch, the full run for three.
NUM_EPOCHS = 1 if QUICK_RUN else 3
# Larger batches only when a CUDA device is available.
BATCH_SIZE = 16 if torch.cuda.is_available() else 8
# Maximum token length used when tokenizing texts.
MAX_LEN = 128

print('DATA_PATH:', DATA_PATH)
print('MODEL_DIR:', MODEL_DIR)
print('QUICK_RUN:', QUICK_RUN, 'EPOCHS:', NUM_EPOCHS, 'BATCH_SIZE:', BATCH_SIZE)


DATA_PATH: data/data_protection_dataset.csv
MODEL_DIR: models/bert_sensitivity
QUICK_RUN: False EPOCHS: 3 BATCH_SIZE: 8


In [18]:

# Load dataset
# Fail early with a clear message when the CSV is missing.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Place your CSV there.")
df = pd.read_csv(DATA_PATH)
print('Rows:', len(df))
# NOTE(review): this preview renders raw identity columns (name, email, phone,
# aadhaar, pan, ...) into the saved notebook output - confirm the data is
# synthetic or clear the output before sharing.
display(df.head())

# Build text column
def make_text(row):
    """Build the single text string that represents one record for the classifier.

    The non-empty values of ``name``, ``email``, ``phone``, ``address`` and
    ``event_type`` are joined with ``' | '``, followed by a trailing
    ``"Transaction INR <amount>"`` part.

    Args:
        row: A mapping-like record exposing ``.get`` (a pandas Series row or a dict).

    Returns:
        str: The joined text. A missing or NaN ``transaction_amount`` is
        rendered as an empty amount rather than the literal string ``nan``.
    """
    parts = []
    for c in ['name','email','phone','address','event_type']:
        v = row.get(c, '') if pd.notna(row.get(c,'')) else ''
        # Skip empty and whitespace-only values.
        if v and str(v).strip():
            parts.append(str(v))
    amount = row.get('transaction_amount', '')
    # A NaN amount would otherwise be formatted as "nan" and fed to the model;
    # treat it exactly like an absent column.
    if pd.isna(amount):
        amount = ''
    parts.append(f"Transaction INR {amount}")
    return ' | '.join(parts)

# Row-wise apply: one classifier input string per record.
df['text'] = df.apply(make_text, axis=1)

# Heuristic labels (for supervised fine-tuning)
def label_row(row):
    """Assign a heuristic sensitivity label to one record.

    Args:
        row: A mapping-like record exposing ``.get`` (a pandas Series row or a dict).

    Returns:
        int: ``2`` when ``aadhaar`` or ``pan`` holds a non-blank value, ``1``
        when ``email`` or ``phone`` does, otherwise ``0``.
    """
    def _filled(column):
        # A field counts as present when it is not NaN/None and not blank.
        value = row.get(column)
        return pd.notna(value) and bool(str(value).strip())

    if _filled('aadhaar') or _filled('pan'):
        return 2
    if _filled('email') or _filled('phone'):
        return 1
    return 0

df['label'] = df.apply(label_row, axis=1)
print('Label distribution:\n', df['label'].value_counts())
# With a single class the classifier has nothing to discriminate, and
# accuracy / macro-F1 on the validation split are trivially perfect.
if df['label'].nunique() < 2:
    print('WARNING: only one label class is present in the dataset; '
          'fine-tuning will be degenerate and evaluation metrics will not be meaningful.')


Rows: 200


Unnamed: 0,name,email,phone,aadhaar,pan,address,transaction_amount,access_time,ip_address,event_type
0,Yuvaan Shetty,sidhudhruv@devi.com,195064218,5139 5713 7713,FFTIV8802J,"H.No. 22, Saran Circle, Saharanpur-533322",38316.57,2024-01-03 22:53:00,63.223.125.68,LOGIN
1,Yashvi Ahluwalia,dsom@yadav.com,914272822415,3666 6865 8017,KOPMD3006N,"11/21, Tripathi Nagar, Jaipur 804158",33660.28,2024-01-07 07:30:00,200.74.114.217,LOGOUT
2,Jivika Sachar,qravi@bath.biz,49274419,7261 9935 3973,FBLLW7021X,"608, Choudhary Ganj, Howrah 353910",21059.8,2024-01-01 08:04:00,181.5.110.179,FAILED_LOGIN
3,Aradhya Magar,cmaharaj@din.com,918653184494,2112 2893 5177,YXPFE7972M,"66/736, Bawa Nagar, Yamunanagar 938547",19773.25,2024-02-04 09:51:00,208.88.77.128,LOGIN
4,Tara Cherian,wagleritvik@raja.org,5066730025,9895 5214 3164,GPAMD5692N,"H.No. 931, Chaudhari Path, Nizamabad 926737",36964.1,2024-02-04 08:11:00,77.137.248.169,DELETE


Label distribution:
 label
2    200
Name: count, dtype: int64


In [20]:

# Checkpoint identifier used for both the tokenizer and the classification model.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    """Torch dataset that tokenizes one text per item and attaches its label.

    Args:
        texts: Sequence of texts (pandas Series, numpy array, or plain list/tuple).
        labels: Sequence of integer class labels, same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping of tensors with a leading batch dimension of 1.
        max_len: Length every item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Series / arrays expose .tolist(); plain Python sequences are copied with list().
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized tensors for item ``idx`` plus a ``labels`` tensor."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        enc = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        # Drop the batch dimension the tokenizer adds for a single text.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Quick smoke check: tokenize a short phrase and show the shape of the resulting id tensor.
print('Tokenizer ready. Example tokenization:')
print(tokenizer('Hello world', truncation=True, return_tensors='pt')['input_ids'].shape)


Tokenizer ready. Example tokenization:
torch.Size([1, 4])


In [24]:

# Train/validation split (stratified)
# 15% of the rows are held out for validation; stratify keeps the label
# proportions of df['label'] in both parts.
train_df, val_df = train_test_split(df[['text','label']], test_size=0.15, random_state=42, stratify=df['label'])

# Quick demo: cap the training set at 500 rows and the validation set at 200 rows.
if QUICK_RUN:
    train_df = train_df.sample(min(500, len(train_df)), random_state=42)
    val_df = val_df.sample(min(200, len(val_df)), random_state=42)

train_dataset = TextDataset(train_df['text'], train_df['label'], tokenizer, max_len=MAX_LEN)
val_dataset = TextDataset(val_df['text'], val_df['label'], tokenizer, max_len=MAX_LEN)

# Three classes, matching the 0 / 1 / 2 values produced by label_row.
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Evaluation and checkpointing both happen once per epoch; checkpoints are
# written under MODEL_DIR. The metric names below are the keys returned by
# compute_metrics.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro' if not QUICK_RUN else 'accuracy',
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    seed=42
)

from sklearn.metrics import f1_score
import numpy as np
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict: ``{'accuracy': ..., 'f1_macro': ...}``.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=-1)
    metrics = {}
    metrics['accuracy'] = accuracy_score(targets, predicted)
    metrics['f1_macro'] = f1_score(targets, predicted, average='macro')
    return metrics

# Wire the model, training arguments, datasets and metric function together; training itself starts in the next cell.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)
print('Trainer configured. EPOCHS:', NUM_EPOCHS)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer configured. EPOCHS: 3


In [26]:

# === TRAINING: run this cell to start fine-tuning ===
print('Beginning training... This may take a long time on CPU. Use GPU if possible.')
# Wall-clock timestamps, used only for the elapsed-time message below.
train_start = time.time()
trainer.train()
# Persist the model and the tokenizer to the same directory so a later cell
# can reload both from MODEL_DIR.
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
train_end = time.time()
print(f'Training finished in {train_end - train_start:.1f} seconds. Model saved to {MODEL_DIR}')


Beginning training... This may take a long time on CPU. Use GPU if possible.




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.011239,1.0,1.0
2,No log,0.003551,1.0,1.0
3,0.104900,0.002822,1.0,1.0




Training finished in 144.8 seconds. Model saved to models/bert_sensitivity


In [28]:

# Integration pipeline helpers: try to import src modules, otherwise use internal fallbacks
try:
    from src.detection.detection_pipeline import detect_row
    from src.policy_engine.policy_validator import load_rules, validate_row
    from src.protection.masking import mask_phone, mask_aadhaar
    from src.protection.encryption import encrypt_value, load_key
    from src.anomaly_detection.anomaly_model import train_anomaly_model, detect_anomalies
    from src.reporting.report_generator import save_compliance_report
    print('Imported project modules from src/.')
except Exception as e:
    print('Falling back to inline implementations (src/ not found or import error):', e)
    import re, yaml
    # Five uppercase letters, four digits, one uppercase letter.
    PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
    # Twelve digits in three groups of four, each optionally separated by one whitespace character.
    AADHAAR_RE = re.compile(r'\b\d{4}\s?\d{4}\s?\d{4}\b')
    # local-part@domain.tld built from word characters, dots and hyphens.
    EMAIL_RE = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')
    # Ten digits, optionally preceded by a prefix of '+'/1-3 digits and one separator.
    # NOTE(review): an unseparated 12-digit number matches both AADHAAR_RE and
    # PHONE_RE (2-digit prefix + 10 digits), so such values get both labels.
    PHONE_RE = re.compile(r'\b(\+?\d{1,3}[-.\s]?)?(\d{10})\b')
    def detect_patterns(val):
        """Return the labels of all sensitive-data patterns found in ``val``.

        ``val`` is converted to ``str`` first. The result lists matching labels
        in the fixed order pan, aadhaar, email, phone; it is empty when nothing
        matches.
        """
        text = str(val)
        checks = (
            ('pan', PAN_RE),
            ('aadhaar', AADHAAR_RE),
            ('email', EMAIL_RE),
            ('phone', PHONE_RE),
        )
        return [label for label, pattern in checks if pattern.search(text)]
    def detect_row(row):
        """Run pattern detection over the identity columns of one record.

        Returns a dict mapping each column name (of name, email, phone,
        aadhaar, pan, address) that produced at least one match to the list
        of labels detected in that column's value.
        """
        hits = {}
        for column in ('name', 'email', 'phone', 'aadhaar', 'pan', 'address'):
            labels = detect_patterns(row.get(column, ''))
            if labels:
                hits[column] = labels
        return hits
    def load_rules(path='config/policy_rules.yaml'):
        """Load the policy rules from a YAML file, creating a default file if absent.

        Args:
            path: Location of the rules file. When it does not exist, a default
                rule set is written there first.

        Returns:
            dict: The parsed rules; an empty dict when the file is empty.
        """
        if not os.path.exists(path):
            # dirname is '' for a bare filename, and os.makedirs('') raises.
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            default = {'aadhaar': {'encrypted': True}, 'email': {'allow_plaintext': False}, 'logs': {'contains_sensitive_data': False}}
            with open(path, 'w') as f:
                yaml.dump(default, f)
        with open(path) as f:
            # safe_load yields None for an empty document; callers use rules.get(...).
            return yaml.safe_load(f) or {}
    def validate_row(row, detections, rules):
        """Check one record against the policy rules and list its violations.

        Args:
            row: Mapping-like record (pandas Series row or dict).
            detections: Detection result for the record; it is searched via its
                lower-cased string form, so any structure naming the detected
                labels works.
            rules: Parsed policy rules; ``None`` is treated as "no rules".

        Returns:
            list[str]: ``'aadhaar_plaintext_found'`` when an aadhaar is present
            or detected, the rules require encryption, and the record has no
            ``aadhaar_encrypted`` entry; ``'email_plaintext_found'`` when an
            email is detected and plaintext email is not allowed.
        """
        rules = rules or {}
        violations = []
        detected = str(detections).lower()

        aadhaar_value = row.get('aadhaar')
        # NaN is truthy in Python, so a missing aadhaar must be excluded
        # explicitly or it is reported as plaintext.
        has_aadhaar = bool(
            pd.notna(aadhaar_value) and aadhaar_value and str(aadhaar_value).strip()
        )
        if 'aadhaar' in detected or has_aadhaar:
            if rules.get('aadhaar', {}).get('encrypted', False) and 'aadhaar_encrypted' not in row:
                violations.append('aadhaar_plaintext_found')
        if 'email' in detected and not rules.get('email', {}).get('allow_plaintext', True):
            violations.append('email_plaintext_found')
        return violations
    from cryptography.fernet import Fernet
    # Location of the symmetric key used by the inline encryption helpers.
    KEY_FILE = 'config/encryption_key.key'
    def generate_key():
        """Create a new Fernet key, write it to KEY_FILE and return it."""
        fresh_key = Fernet.generate_key()
        key_dir = os.path.dirname(KEY_FILE)
        os.makedirs(key_dir, exist_ok=True)
        with open(KEY_FILE, 'wb') as handle:
            handle.write(fresh_key)
        return fresh_key
    def load_key():
        """Return the key bytes stored in KEY_FILE, generating the file first if it is missing."""
        if os.path.exists(KEY_FILE):
            with open(KEY_FILE, 'rb') as handle:
                return handle.read()
        return generate_key()
    def encrypt_value(val):
        """Encrypt ``str(val)`` with the stored Fernet key and return the token as text."""
        cipher = Fernet(load_key())
        plaintext = str(val).encode()
        token = cipher.encrypt(plaintext)
        return token.decode()
    def mask_phone(p):
        """Mask a phone number, keeping its first two and last three characters.

        Values whose string form is shorter than 10 characters are returned
        as that string, unmasked.
        """
        digits = str(p)
        if len(digits) < 10:
            return digits
        return ''.join((digits[:2], 'XXXXX', digits[-3:]))
    def mask_aadhaar(a):
        """Mask an Aadhaar number, keeping its first and last four characters.

        Spaces are removed before measuring; when fewer than 12 characters
        remain, the original argument is returned unchanged.
        """
        compact = str(a).replace(' ', '')
        if len(compact) < 12:
            return a
        return f"{compact[:4]} XXXX {compact[-4:]}"
    from sklearn.ensemble import IsolationForest
    def featurize_logs(df_local):
        """Derive the numeric feature frame used by the anomaly model.

        Works on a copy of ``df_local`` and returns a frame with the columns
        ``hour`` (hour of the parsed ``access_time``), ``txn_amount``
        (``transaction_amount`` with missing values as 0) and ``event_code``
        (category code of ``event_type``); remaining NaNs are filled with 0.
        """
        feature_cols = ['hour', 'txn_amount', 'event_code']
        enriched = df_local.copy()
        enriched['access_time'] = pd.to_datetime(enriched['access_time'])
        enriched = enriched.assign(
            hour=enriched['access_time'].dt.hour,
            txn_amount=enriched['transaction_amount'].fillna(0),
            event_code=enriched['event_type'].astype('category').cat.codes,
        )
        return enriched[feature_cols].fillna(0)
    def train_anomaly_model(df_local, save_path='models/anomaly_iforest.pkl'):
        """Fit an IsolationForest on the log features, save it to ``save_path`` and return it."""
        features = featurize_logs(df_local)
        forest = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
        forest.fit(features)
        joblib.dump(forest, save_path)
        return forest
    def detect_anomalies(df_local, model_path='models/anomaly_iforest.pkl'):
        """Score records with the saved anomaly model.

        Args:
            df_local: Frame with the columns required by ``featurize_logs``.
            model_path: Path of the model file written by ``train_anomaly_model``.

        Returns:
            DataFrame: A copy of ``df_local`` with a boolean ``anomaly`` column
            that is True where the model predicted ``-1``. The input frame is
            left unmodified.
        """
        # NOTE: joblib.load unpickles the file; only load model files this
        # pipeline produced itself.
        model_local = joblib.load(model_path)
        preds = model_local.predict(featurize_logs(df_local))
        # Work on a copy so the caller's frame does not silently gain a column.
        result = df_local.copy()
        result['anomaly'] = (preds == -1)
        return result
    def save_compliance_report(violations_list, out_path='reports/compliance_report.json'):
        """Write the compliance report as JSON and return the path written.

        Args:
            violations_list: List of violation entries; stored as-is under
                ``'violations'`` and counted under ``'summary'``.
            out_path: Destination file. Its parent directory is created when
                the path has one.

        Returns:
            str: ``out_path``.
        """
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        report = {'summary': {'total_violations': len(violations_list)}, 'violations': violations_list}
        with open(out_path, 'w') as f:
            json.dump(report, f, indent=2)
        return out_path
# Printed after either branch above: the src/ imports succeeded or the inline fallbacks were defined.
print('Integration helpers ready.')

Imported project modules from src/.
Integration helpers ready.


In [30]:

# Load saved fine-tuned model (if exists) else use pretrained for inference
# MODEL_DIR is also the Trainer output directory, so it can be non-empty while
# holding only intermediate checkpoint folders (e.g. after an interrupted run).
# Check for the saved model config instead of "directory has any content".
if os.path.isfile(os.path.join(MODEL_DIR, 'config.json')):
    print('Loading fine-tuned model from', MODEL_DIR)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
else:
    # In this branch the classification head is freshly initialised, so the
    # predictions are not meaningful until the model has been fine-tuned.
    print('Fine-tuned model not found — using pretrained DistilBERT for inference')
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Evaluation mode for inference.
model.eval()
print('Model ready on', device)


Loading fine-tuned model from models/bert_sensitivity
Model ready on cpu


In [32]:

# Build text for BERT and run hybrid detection
# The inference text is the same string that was built for training.
df['text_for_bert'] = df['text']

print('Running regex detection...')
# One detection result per row, stored as an object column.
df['regex_detections'] = df.apply(detect_row, axis=1)

print('Running BERT inference... (may take a while)')
def bert_predict_batch(texts, tokenizer, model, device, batch_size=32):
    """Classify ``texts`` in batches and return predictions with confidences.

    Args:
        texts: List of input strings.
        tokenizer: Tokenizer called on each batch (truncated to MAX_LEN, padded
            to the longest item of the batch).
        model: Sequence-classification model whose output exposes ``.logits``.
        device: Torch device the input tensors are moved to.
        batch_size: Number of texts per forward pass.

    Returns:
        tuple[list, list]: Predicted class index per text, and the softmax
        probability of that class per text.
    """
    all_preds = []
    all_confs = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
        ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)
        # No gradients are needed for inference.
        with torch.no_grad():
            result = model(input_ids=ids, attention_mask=mask)
            probabilities = torch.softmax(result.logits, dim=-1).cpu().numpy()
        all_preds += np.argmax(probabilities, axis=1).tolist()
        all_confs += np.max(probabilities, axis=1).tolist()
    return all_preds, all_confs

# Run inference over every row and attach the predicted class and its probability.
texts = df['text_for_bert'].astype(str).tolist()
bert_preds, bert_confs = bert_predict_batch(texts, tokenizer, model, device, batch_size=32)
df['bert_pred'] = bert_preds
df['bert_conf'] = bert_confs
# Human-readable names for the class indices 0 / 1 / 2.
label_map = {0:'Public', 1:'Sensitive', 2:'Highly Sensitive'}
df['bert_label'] = df['bert_pred'].map(label_map)

def synthesize_sensitivity(row):
    """Combine column values, regex detections and the BERT label into one sensitivity level.

    Returns ``'Highly Sensitive'`` when the record has a non-blank ``aadhaar``
    or ``pan`` value, or when either name appears in the lower-cased string
    form of ``regex_detections``; otherwise the record's ``bert_label``
    (``'Public'`` when that is absent).
    """
    regex_text = str(row.get('regex_detections', {})).lower()
    for field in ('aadhaar', 'pan'):
        value = row.get(field)
        has_value = pd.notna(value) and str(value).strip()
        if has_value or field in regex_text:
            return 'Highly Sensitive'
    return row.get('bert_label', 'Public')

# Final per-row sensitivity level, followed by a count of rows per level.
df['sensitivity'] = df.apply(synthesize_sensitivity, axis=1)
print('Sensitivity distribution:\n', df['sensitivity'].value_counts())

# Protection + policy validation
print('Applying masking/encryption and validating policies...')
load_key()
rules = load_rules()
violations = []
protected_rows = []


def _has_value(value):
    """Return True when a cell holds a real value (not NaN/None and not blank)."""
    return bool(pd.notna(value) and str(value).strip())


for idx, row in df.iterrows():
    new_row = row.copy()
    phone = row.get('phone')
    aadhaar = row.get('aadhaar')
    pan = row.get('pan')
    # NaN is truthy in Python, so a plain `if value:` would mask/encrypt the
    # string 'nan' for missing cells; test for a real value instead.
    if _has_value(phone):
        new_row['phone_masked'] = mask_phone(phone)
    if _has_value(aadhaar):
        new_row['aadhaar_masked'] = mask_aadhaar(aadhaar)
    if row['sensitivity'] == 'Highly Sensitive':
        if _has_value(aadhaar):
            new_row['aadhaar_encrypted'] = encrypt_value(str(aadhaar))
        if _has_value(pan):
            new_row['pan_encrypted'] = encrypt_value(str(pan))
    # NOTE(review): validation runs on the original row, not on new_row, so
    # any check for protected fields cannot see the values added above -
    # confirm this is the intended "pre-protection" audit.
    v = validate_row(row, row.get('regex_detections', {}), rules)
    if v:
        violations.append({'index': int(idx), 'violations': v})
    protected_rows.append(new_row)

# Assemble the per-row results into one frame and write it out.
# NOTE(review): each protected row is a copy of the source row with extra
# masked/encrypted columns, so the original plaintext columns are still part
# of this CSV - confirm whether they should be dropped before export.
protected_df = pd.DataFrame(protected_rows)
protected_df.to_csv('reports/protected_sample.csv', index=False)
print('Saved reports/protected_sample.csv')

# Persist the collected policy violations as JSON.
save_compliance_report(violations, out_path='reports/compliance_report.json')
print('Saved reports/compliance_report.json')

print('Training anomaly detection and writing anomalies...')
# The model is fitted and then applied to the same frame.
train_anomaly_model(df)
adf = detect_anomalies(df)
adf.to_csv('reports/anomalies_detected.csv', index=False)
# The 'anomaly' column is summed to count the flagged rows.
print('Saved reports/anomalies_detected.csv. Total anomalies:', int(adf['anomaly'].sum()))


Running regex detection...
Running BERT inference... (may take a while)
Sensitivity distribution:
 sensitivity
Highly Sensitive    200
Name: count, dtype: int64
Applying masking/encryption and validating policies...
Saved reports/protected_sample.csv
Saved reports/compliance_report.json
Training anomaly detection and writing anomalies...




Saved reports/anomalies_detected.csv. Total anomalies: 4


In [34]:
# Source of the generated Streamlit app. The app reads three report files, so
# it verifies that all of them exist before loading any (previously only the
# first was checked and a missing second/third file crashed the dashboard).
streamlit_code = r'''
import streamlit as st
import pandas as pd, os, json

st.set_page_config(page_title="Data Protection Dashboard", layout="wide")
st.title("AI-Powered Data Protection Dashboard")

REQUIRED_REPORTS = [
    "reports/protected_sample.csv",
    "reports/anomalies_detected.csv",
    "reports/compliance_report.json",
]

if not all(os.path.exists(p) for p in REQUIRED_REPORTS):
    st.info("Run the pipeline notebook first to produce reports in reports/.")
else:
    protected = pd.read_csv("reports/protected_sample.csv")
    anomalies = pd.read_csv("reports/anomalies_detected.csv")
    with open("reports/compliance_report.json") as f:
        compliance = json.load(f)

    st.header("Protected Sample (preview)")
    st.dataframe(protected.head(100))

    st.header("Sensitivity distribution")
    st.bar_chart(protected['sensitivity'].value_counts())

    st.header("Policy Violations")
    st.write("Total violations:", compliance.get("summary",{}).get("total_violations",0))
    st.json(compliance.get("violations", []))

    st.header("Anomalies")
    st.write("Total anomalies:", int(anomalies['anomaly'].sum()))
    st.dataframe(anomalies.head(100))

    st.markdown("## Download Reports")
    st.markdown("- reports/protected_sample.csv")
    st.markdown("- reports/anomalies_detected.csv")
    st.markdown("- reports/compliance_report.json")
'''

# Write the app next to the notebook so it can be started with `streamlit run`.
with open('dashboards/streamlit_app.py','w',encoding='utf-8') as f:
    f.write(streamlit_code)
print('Wrote dashboards/streamlit_app.py')

Wrote dashboards/streamlit_app.py



## How to run this notebook (Full B version)
1. Set `QUICK_RUN=False` for full training (default in this notebook). If you don't have GPU or want a quick test, set `QUICK_RUN=True` and re-run the notebook.
2. Run all cells from top to bottom. Training will begin where indicated and can take a long time on CPU.
3. After the integration cells complete, the `reports/` folder will contain:
   - `protected_sample.csv`
   - `anomalies_detected.csv`
   - `compliance_report.json`
4. Run the Streamlit dashboard from terminal:
   ```bash
   streamlit run dashboards/streamlit_app.py
   ```
   Or:
   ```bash
   python -m streamlit run dashboards/streamlit_app.py
   ```
5. Open `http://localhost:8501` in your browser to view the dashboard.
