In [2]:
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                 CELL 13: SAVE MODEL AND VECTORIZER                           ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

import os

print("💾 Saving trained model and vectorizer...")

# Persist the fitted estimator and the fitted vectorizer with joblib,
# confirming each artifact right after it has been written.
for artifact, target in ((model, 'autorisk_model.pkl'),
                         (vectorizer, 'autorisk_vectorizer.pkl')):
    joblib.dump(artifact, target)
    print(f"✅ Saved: {target}")

# Metadata stored next to the artifacts. Metric values are passed through
# float() and the class labels through list() before JSON serialization.
dataset_info = dict(
    total_samples=len(df),
    train_samples=X_train.shape[0],
    test_samples=X_test.shape[0],
    features=X.shape[1],
    accuracy=float(accuracy),
    precision=float(precision),
    recall=float(recall),
    f1_score=float(f1),
    classes=list(model.classes_),
    training_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
)

with open('model_info.json', 'w') as info_file:
    json.dump(dataset_info, info_file, indent=2)
print("✅ Saved: model_info.json")

print("\n📦 Model files ready for deployment!")

# Report the on-disk size (in KB) of every artifact that exists.
print("\n📊 File Sizes:")
for filename in ('autorisk_model.pkl', 'autorisk_vectorizer.pkl', 'model_info.json'):
    if not os.path.exists(filename):
        continue
    size = os.path.getsize(filename) / 1024
    print(f"   {filename:30s}: {size:8.2f} KB")


"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                    CELL 14: INTERACTIVE PREDICTIONS                          ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

def predict_severity(description):
    """Classify one CVE description with the trained severity model.

    The text is passed through ``clean_text``, turned into features by the
    module-level ``vectorizer``, and scored by the module-level ``model``.

    Args:
        description: CVE description text to classify.

    Returns:
        A 3-tuple ``(label, scores, classes)``: the predicted class for the
        description, the per-class probabilities for that same input, and
        ``model.classes_`` so callers can pair each probability with its
        class label.
    """
    # The vectorizer is handed a one-element list holding the cleaned text.
    feature_row = vectorizer.transform([clean_text(description)])

    # Only one document was submitted, so take the first (only) result.
    label = model.predict(feature_row)[0]
    scores = model.predict_proba(feature_row)[0]

    return label, scores, model.classes_

heavy_rule = "=" * 80
light_rule = '─' * 80

print(heavy_rule)
print("                        SAMPLE PREDICTIONS")
print(heavy_rule)

# Hand-written descriptions, each paired with the severity it is expected
# to be assigned.
test_cases = [
    dict(
        description="Remote code execution vulnerability allows unauthenticated attackers to execute arbitrary system commands through malicious file upload leading to complete server compromise and data breach",
        expected="CRITICAL",
    ),
    dict(
        description="Buffer overflow in network service allows remote attackers to cause denial of service or potentially execute code through specially crafted packets",
        expected="HIGH",
    ),
    dict(
        description="Cross-site scripting vulnerability in comment field enables attackers to inject malicious JavaScript that executes in victim browsers",
        expected="MEDIUM",
    ),
    dict(
        description="Information disclosure through verbose error messages reveals internal server paths and configuration details",
        expected="LOW",
    ),
    dict(
        description="SQL injection in login form allows attackers to bypass authentication and extract sensitive database information including user credentials",
        expected="CRITICAL",
    ),
]

# Marker printed beside each class label; any label not listed here falls
# back to the green marker.
severity_markers = {'CRITICAL': '🔴', 'HIGH': '🟠', 'MEDIUM': '🟡'}

for i, test_case in enumerate(test_cases, 1):
    desc, expected = test_case['description'], test_case['expected']
    prediction, probabilities, classes = predict_severity(desc)
    verdict = '✅ CORRECT' if prediction == expected else '❌ INCORRECT'

    print(f"\n{light_rule}")
    print(f"Test Case {i}:")
    print(light_rule)
    print(f"Description: {desc}")
    print(f"\n🎯 Predicted: {prediction} | Expected: {expected} | {verdict}")
    print("\n📊 Confidence Scores:")

    # One 40-cell bar per class: filled cells are proportional to the
    # probability, the remainder is padded with the light shade.
    for severity, prob in zip(classes, probabilities):
        bar_length = int(prob * 40)
        bar = '█' * bar_length + '░' * (40 - bar_length)
        emoji = severity_markers.get(severity, '🟢')
        print(f"   {emoji} {severity:10s} [{bar}] {prob*100:5.1f}%")

print("\n" + heavy_rule)


"""
╔══════════════════════════════════════════════════════════════════════════════╗
║                 CELL 15: PROJECT SUMMARY & NEXT STEPS                        ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

# Final report: completed work, remaining work, current metrics and suggested
# follow-ups. Reads X, X_train, X_test, accuracy, precision, recall and f1,
# which must already be defined when this runs.
section_rule = "=" * 80
accuracy_target = 0.80  # accuracy at or above this is reported as "TARGET MET"

print("\n" + section_rule)
print("                    📊 PROJECT COMPLETION SUMMARY")
print(section_rule)

print("\n✅ COMPLETED COMPONENTS (~60% Implementation):\n")

completed_tasks = [
    ("Data Acquisition", "Downloaded/Generated NVD CVE dataset"),
    ("Data Extraction", "Parsed JSON and extracted 600+ CVE records"),
    ("Text Preprocessing", "Cleaned descriptions (URLs, special chars, normalization)"),
    # Use the real column count of the feature matrix so this line always
    # agrees with the "Features:" figure printed in the performance section.
    ("Feature Engineering", f"TF-IDF vectorization with {X.shape[1]} features + bigrams"),
    ("Train-Test Split", f"{X_train.shape[0]} train, {X_test.shape[0]} test samples (stratified)"),
    ("Model Training", "Logistic Regression with balanced class weights"),
    ("Model Evaluation", f"{accuracy*100:.2f}% accuracy, {f1:.4f} F1-score"),
    ("Visualizations", "7 comprehensive charts (distribution, confusion matrix, etc.)"),
    ("Model Persistence", "Saved model, vectorizer, and metadata"),
    ("Prediction Pipeline", "Working demo with confidence scores"),
]

for i, (task, detail) in enumerate(completed_tasks, 1):
    print(f"   {i:2d}. ✅ {task:20s} - {detail}")

print("\n\n🚧 REMAINING WORK (~40% for Production-Ready System):\n")

# Each entry is (category heading, list of bullet lines printed beneath it).
remaining_tasks = [
    ("Advanced Models (15%)", [
        "• Implement BERT/RoBERTa fine-tuning for improved accuracy",
        "• Add ensemble methods (Random Forest, XGBoost, Voting Classifier)",
        "• Implement deep learning model (LSTM/BiLSTM)",
        "• Create model comparison framework",
    ]),
    ("Hyperparameter Tuning (5%)", [
        "• Grid search / Bayesian optimization",
        "• Cross-validation (K-fold, stratified)",
        "• Feature selection optimization",
        "• Learning curve analysis",
    ]),
    ("Deployment Interface (15%)", [
        "• Build Streamlit web dashboard",
        "• Create REST API with FastAPI",
        "• Add batch prediction functionality",
        "• Real-time prediction endpoint",
    ]),
    ("Production Features (5%)", [
        "• Implement model monitoring & drift detection",
        "• Add explainability (SHAP/LIME)",
        "• Create automated retraining pipeline",
        "• Setup logging and error handling",
    ]),
]

for category, tasks in remaining_tasks:
    print(f"\n   📌 {category}:")
    for task in tasks:
        print(f"      {task}")

print(f"\n\n{section_rule}")
print("                    🎯 CURRENT PERFORMANCE")
print(section_rule)

target_note = '✅ TARGET MET!' if accuracy >= accuracy_target else '⚠️ Below target'

print("\n   Model Type:        Logistic Regression (Baseline)")
print(f"   Training Samples:  {X_train.shape[0]}")
print(f"   Test Samples:      {X_test.shape[0]}")
print(f"   Features:          {X.shape[1]} (TF-IDF)")
print("\n   📊 Metrics:")
print(f"      Accuracy:       {accuracy*100:.2f}% {target_note}")
print(f"      Precision:      {precision:.4f}")
print(f"      Recall:         {recall:.4f}")
print(f"      F1-Score:       {f1:.4f}")

print(f"\n\n{section_rule}")
print("                    💡 RECOMMENDED NEXT STEPS")
print(section_rule)

next_steps = [
    "1. Test with real CVE descriptions from your organization",
    "2. Fine-tune BERT model for 85-90% accuracy target",
    "3. Build Streamlit UI for easy team access",
    "4. Implement explainability to understand predictions",
    "5. Create automated pipeline for new CVE ingestion",
    "6. Deploy as microservice with Docker + Kubernetes",
]

for step in next_steps:
    print(f"   {step}")

print(f"\n\n{section_rule}")
print("🎉 AutoRisk v0.6 - Baseline Implementation Complete!")
print(section_rule)
print("\n📂 Generated Files:")
print("   • autorisk_model.pkl (trained model)")
print("   • autorisk_vectorizer.pkl (TF-IDF vectorizer)")
print("   • autorisk_analysis.png (visualizations)")
print("   • model_info.json (metadata)")

print("\n🚀 Ready to move to Phase 2: Advanced Models & UI Development!")
print(f"{section_rule}\n")

💾 Saving trained model and vectorizer...
✅ Saved: autorisk_model.pkl
✅ Saved: autorisk_vectorizer.pkl
✅ Saved: model_info.json

📦 Model files ready for deployment!

📊 File Sizes:
   autorisk_model.pkl            :    11.76 KB
   autorisk_vectorizer.pkl       :    15.33 KB
   model_info.json               :     0.33 KB
                        SAMPLE PREDICTIONS

────────────────────────────────────────────────────────────────────────────────
Test Case 1:
────────────────────────────────────────────────────────────────────────────────
Description: Remote code execution vulnerability allows unauthenticated attackers to execute arbitrary system commands through malicious file upload leading to complete server compromise and data breach

🎯 Predicted: CRITICAL | Expected: CRITICAL | ✅ CORRECT

📊 Confidence Scores:
   🔴 CRITICAL   [█████████████████████████████░░░░░░░░░░░]  74.2%
   🟠 HIGH       [█████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░]  13.2%
   🟢 LOW        [██░░░░░░░░░░░░░░░░░░░░░░░░░░░░░