# Step 5: Personalized Roadmap Generator

This notebook implements the logic for generating personalized learning roadmaps for students. 
It covers:
1. Loading required data (Students, Profiles, Recommendations, Models).
2. Defining Project Templates for each career path.
3. Determining the best career path for a student.
4. Generating a 3-stage roadmap (Beginner, Intermediate, Advanced).
5. Visualizing the roadmap.

In [None]:
import json
import os
import re
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Setup paths
BASE_DIR = Path(".")
MODELS_DIR = BASE_DIR / "models"
PROFILES_DIR = BASE_DIR / "skill_gap_profiles"
RECS_DIR = BASE_DIR / "recommendations"
ROADMAPS_DIR = BASE_DIR / "roadmaps"

os.makedirs(ROADMAPS_DIR, exist_ok=True)

print("Directories set up.")

## 1. Load Data & Validation

In [None]:
def validate_files():
    required_files = [
        BASE_DIR / "digital_twin_students_1500_cleaned.csv",
        PROFILES_DIR / "student_profiles.json",
        RECS_DIR / "recommendations.json",
        MODELS_DIR / "career_model_xgb.pkl",
        MODELS_DIR / "label_encoder.pkl",
        MODELS_DIR / "feature_list.pkl",
        MODELS_DIR / "features_all.csv"
    ]
    
    missing = [f for f in required_files if not f.exists()]
    if missing:
        raise FileNotFoundError(f"Missing required files: {missing}")
    print("All required files found.")

# Run the check now; this raises FileNotFoundError if any required input is missing.
validate_files()

In [None]:
# Load Data
print("Loading data...")
df_students = pd.read_csv(BASE_DIR / "digital_twin_students_1500_cleaned.csv", low_memory=False)

# JSON text is UTF-8; decode it explicitly instead of relying on the
# platform's default encoding, which differs between operating systems.
with open(PROFILES_DIR / "student_profiles.json", "r", encoding="utf-8") as f:
    profiles = json.load(f)
# Index by student id for O(1) lookup; if an id repeats, the last record wins.
profiles_map = {p['student_id']: p for p in profiles}

with open(RECS_DIR / "recommendations.json", "r", encoding="utf-8") as f:
    recommendations = json.load(f)
# Same indexing (and same last-wins rule) for the recommendation records.
recs_map = {r['student_id']: r for r in recommendations}

# Load Models
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these
# artifacts must come from a trusted source.
model = joblib.load(MODELS_DIR / "career_model_xgb.pkl")
le = joblib.load(MODELS_DIR / "label_encoder.pkl")
feature_cols = joblib.load(MODELS_DIR / "feature_list.pkl")
df_features = pd.read_csv(MODELS_DIR / "features_all.csv")

print(f"Loaded {len(df_students)} students, {len(profiles)} profiles, {len(recommendations)} recommendations.")

## 2. Define Project Templates
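Define one static project template per career path and stage (Beginner, Intermediate, Advanced) so that every roadmap stage includes a concrete project. Careers without their own templates fall back to the "Software" templates.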

In [None]:
def _project(title, skills, kind):
    """Build one project-template record with the fixed title/skills/type keys."""
    return {"title": title, "skills": skills, "type": kind}


# Career path -> stage -> project template.
# Every career defines the same three stages: Beginner, Intermediate, Advanced.
PROJECT_TEMPLATES = {
    "Data": {
        "Beginner": _project("Exploratory Data Analysis on Titanic Dataset", ["python", "pandas", "matplotlib"], "Analysis"),
        "Intermediate": _project("E-commerce Sales Dashboard", ["sql", "tableau", "data cleaning"], "Visualization"),
        "Advanced": _project("Predictive Customer Churn Model", ["scikit-learn", "feature engineering", "modeling"], "ML"),
    },
    "Machine Learning": {
        "Beginner": _project("House Price Prediction", ["python", "regression", "scikit-learn"], "Regression"),
        "Intermediate": _project("Image Classification with CNNs", ["tensorflow", "deep learning", "cnn"], "Computer Vision"),
        "Advanced": _project("End-to-End NLP Chatbot", ["nlp", "transformers", "deployment"], "NLP"),
    },
    "Cloud": {
        "Beginner": _project("Static Website Hosting on AWS S3", ["aws", "s3", "dns"], "Infrastructure"),
        "Intermediate": _project("Serverless API with Lambda & DynamoDB", ["aws lambda", "api gateway", "nosql"], "Backend"),
        "Advanced": _project("Multi-Tier Microservices Architecture", ["kubernetes", "docker", "terraform"], "DevOps"),
    },
    "Cybersecurity": {
        "Beginner": _project("Network Traffic Analysis with Wireshark", ["networking", "wireshark", "protocols"], "Analysis"),
        "Intermediate": _project("Vulnerability Scanner Implementation", ["python", "security", "scripting"], "Security Tool"),
        "Advanced": _project("Simulated Penetration Test Report", ["metasploit", "ethical hacking", "reporting"], "PenTesting"),
    },
    "Software": {
        "Beginner": _project("Personal Portfolio Website", ["html", "css", "javascript"], "Frontend"),
        "Intermediate": _project("REST API for Task Management", ["nodejs", "express", "mongodb"], "Backend"),
        "Advanced": _project("Full Stack E-commerce Platform", ["react", "redux", "stripe integration"], "Full Stack"),
    },
    "Network": {
        "Beginner": _project("Home Network Setup & Configuration", ["networking", "routers", "ip addressing"], "Setup"),
        "Intermediate": _project("Cisco Packet Tracer Simulation", ["cisco", "routing", "switching"], "Simulation"),
        "Advanced": _project("Network Automation with Python", ["python", "ansible", "network automation"], "Automation"),
    },
    "DevOps": {
        "Beginner": _project("Dockerizing a Web Application", ["docker", "containers", "linux"], "Containerization"),
        "Intermediate": _project("CI/CD Pipeline with Jenkins", ["jenkins", "git", "automation"], "CI/CD"),
        "Advanced": _project("Infrastructure as Code with Terraform", ["terraform", "aws", "iac"], "IaC"),
    },
}

def get_project_template(career, stage):
    """Return the project template for a career path and roadmap stage.

    Args:
        career: Career path name. Names without an entry in
            PROJECT_TEMPLATES fall back to the "Software" templates.
        stage: Stage name, e.g. "Beginner", "Intermediate" or "Advanced".

    Returns:
        A new dict with the template's "title", "skills" and "type" entries.
        List values are copied as well, so callers may modify the result
        without altering the shared PROJECT_TEMPLATES table.

    Raises:
        KeyError: If the stage is defined neither for the resolved career
            nor for "Software".
    """
    software_stages = PROJECT_TEMPLATES["Software"]
    stages = PROJECT_TEMPLATES.get(career, software_stages)

    template = stages.get(stage)
    if template is None:
        # Stage missing for this career: use the Software template, and let
        # an unknown stage surface as KeyError.
        template = software_stages[stage]

    # Hand out a copy: the same template is served to every student, so
    # returning the shared dict would let one caller's edit leak into all
    # other roadmaps.
    return {
        key: list(value) if isinstance(value, list) else value
        for key, value in template.items()
    }

## 3. Career Determination Logic

In [None]:
# Short acronyms that must match as whole words. As bare substrings they hit
# unrelated titles: "ml" is inside "html", "ai" is inside "email",
# "maintenance" and "retail".
_WHOLE_WORD_KEYWORDS = frozenset({"etl", "ml", "ai", "sre"})

# Career class -> keywords, checked in this order; the first class with a
# matching keyword wins (so e.g. "Network Security Engineer" is Cybersecurity).
_JOB_CLASS_KEYWORDS = (
    ("Data", ("data analyst", "data engineer", "data scientist", "etl", "big data", "bi developer",
              "business intelligence", "tableau", "power bi", "sql developer")),
    # "mlops" is listed explicitly so it keeps mapping to Machine Learning now
    # that "ml" only matches as a whole word.
    ("Machine Learning", ("machine learning", "ml", "mlops", "deep learning", "ai", "artificial intelligence",
                          "computer vision", "nlp", "data science")),
    ("Cloud", ("cloud", "aws", "azure", "gcp", "kubernetes", "docker", "serverless")),
    ("Cybersecurity", ("security", "cyber", "penetration", "infosec", "soc analyst", "ethical hacker")),
    ("Network", ("network", "routing", "switching", "cisco", "ccna", "ccnp")),
    ("DevOps", ("devops", "sre", "site reliability", "ci/cd", "jenkins", "terraform", "ansible")),
    ("Software", ("developer", "software", "backend", "frontend", "full stack", "engineer", "web", "react",
                  "angular", "node", "java", "python", ".net")),
)


def _keyword_regex(keyword):
    """Return the regex source for one keyword (whole-word for short acronyms)."""
    escaped = re.escape(keyword)
    if keyword in _WHOLE_WORD_KEYWORDS:
        return rf"\b{escaped}\b"
    return escaped


# One compiled alternation per class, built once instead of on every call.
_JOB_CLASS_PATTERNS = tuple(
    (label, re.compile("|".join(_keyword_regex(k) for k in keywords)))
    for label, keywords in _JOB_CLASS_KEYWORDS
)


def map_job_to_class(job_title):
    """Map a free-text job title to a career class.

    The title is converted with str() and lower-cased, then tested against
    each class's keywords in priority order: Data, Machine Learning, Cloud,
    Cybersecurity, Network, DevOps, Software.

    Args:
        job_title: Job title; any value is accepted and stringified, so
            None or NaN simply yield "Other".

    Returns:
        The name of the first matching class, or "Other" if nothing matches.
    """
    title = str(job_title).lower()
    for label, pattern in _JOB_CLASS_PATTERNS:
        if pattern.search(title):
            return label
    return "Other"

def get_predicted_career(student_id):
    """Predict a student's career label with the loaded classifier.

    Looks the student up in df_features by the 'StudentID' column, scores
    the feature_cols columns with model.predict_proba, and decodes the most
    probable class index through the label encoder.

    Args:
        student_id: Value to match against df_features['StudentID'].

    Returns:
        (label, confidence), where confidence is the winning class
        probability as a builtin float. Returns (None, 0.0) when the student
        has no row in df_features.
    """
    row = df_features[df_features['StudentID'] == student_id]
    if row.empty:
        return None, 0.0

    X = row[feature_cols]
    # If several rows match, only the first row's probabilities are used.
    probs = model.predict_proba(X)[0]
    idx = int(np.argmax(probs))
    label = le.inverse_transform([idx])[0]

    # probs[idx] is a NumPy scalar. Convert it so both return paths yield a
    # builtin float and the value stays JSON-serializable (json rejects
    # NumPy float32 scalars).
    confidence = float(probs[idx])
    return label, confidence

def determine_career_path(student_id):
    """Choose a career path for a student and report where it came from.

    Sources are tried in order:
      1. "Prediction" - the model's label, when it is not "Other" and its
         confidence exceeds 0.4.
      2. "Target Job" - the first entry of the profile's "best_job_matches",
         mapped through map_job_to_class, when that is not "Other".
      3. "Fallback"   - the model's label whatever its confidence, or
         "Software" when there is no prediction at all.

    Returns:
        (career, source) with source being one of the three names above.
    """
    predicted, confidence = get_predicted_career(student_id)

    # 1. Model prediction, if it is specific and confident enough.
    is_specific = bool(predicted) and predicted != "Other"
    if is_specific and confidence > 0.4:
        return predicted, "Prediction"

    # 2. Top job match recorded in the student's profile.
    job_matches = profiles_map.get(student_id, {}).get("best_job_matches")
    if job_matches:
        top_match = job_matches[0]
        # Entries may be dicts carrying a "job_title" or plain title values.
        if isinstance(top_match, dict):
            title = top_match.get("job_title")
        else:
            title = top_match
        career = map_job_to_class(title)
        if career != "Other":
            return career, "Target Job"

    # 3. Fallback: low-confidence (or "Other") prediction, else "Software".
    if predicted:
        return predicted, "Fallback"
    return "Software", "Fallback"

## 4. Generate Roadmap for Sample Student

In [None]:
# Certifications listed in a roadmap, keyed by career path.
_CERTIFICATION_PATHS = {
    "Cloud": ["AWS Cloud Practitioner", "AWS Solutions Architect Associate"],
    "Data": ["Google Data Analytics", "Azure Data Scientist Associate"],
    "Machine Learning": ["TensorFlow Developer Certificate", "AWS Machine Learning Specialty"],
    "Cybersecurity": ["CompTIA Security+", "Certified Ethical Hacker (CEH)"],
    "Network": ["Cisco CCNA", "Cisco CCNP"],
    "DevOps": ["Docker Certified Associate", "CKA (Kubernetes Administrator)"],
    "Software": ["Oracle Java Associate", "Meta Front-End Developer"],
}

# (stage name, duration, focus) for each roadmap stage, in order.
_ROADMAP_STAGES = (
    ("Beginner", "4-6 weeks", "Fundamentals"),
    ("Intermediate", "6-8 weeks", "Application"),
    ("Advanced", "8-10 weeks", "Mastery"),
)

# Number of missing skills assigned to each stage.
_SKILLS_PER_STAGE = 5


def _split_evenly(items, parts):
    """Split items into `parts` contiguous chunks whose sizes differ by at most one."""
    base, extra = divmod(len(items), parts)
    chunks = []
    start = 0
    for i in range(parts):
        # The first `extra` chunks take one additional item each.
        end = start + base + (1 if i < extra else 0)
        chunks.append(items[start:end])
        start = end
    return chunks


def generate_single_roadmap(student_id):
    """Build the three-stage learning roadmap for one student.

    The career path comes from determine_career_path. Each stage receives up
    to five of the student's missing skills (taken in order), a share of the
    recommended courses, and one project template for the career and stage.

    Args:
        student_id: Key used for the profile, recommendation and feature lookups.

    Returns:
        A dict with the keys "student_id", "career_path", "career_source",
        "generated_at" (today's date, ISO format), "stages" and
        "certification_path". Students without a profile or recommendation
        record still get a roadmap, with empty skills/courses.
    """
    from datetime import date  # local import keeps this cell self-contained

    career, source = determine_career_path(student_id)
    profile = profiles_map.get(student_id, {})
    rec = recs_map.get(student_id, {})

    # `or` guards: tolerate keys that are present but hold None.
    missing_skills = (profile.get('skill_gaps') or {}).get('missing_skills') or []
    rec_courses = rec.get('recommended_courses') or []

    roadmap = {
        "student_id": student_id,
        "career_path": career,
        "career_source": source,
        # Stamp the actual generation day rather than a fixed date.
        "generated_at": date.today().isoformat(),
        "stages": [],
        # Copy so the roadmap never aliases the shared table.
        "certification_path": list(_CERTIFICATION_PATHS.get(career, [])),
    }

    # Spread all recommended courses over the stages; earlier stages take the
    # remainder, so no course is dropped when the count is not a multiple of
    # the number of stages.
    course_chunks = _split_evenly(rec_courses, len(_ROADMAP_STAGES))

    for i, (name, duration, focus) in enumerate(_ROADMAP_STAGES):
        first_skill = i * _SKILLS_PER_STAGE
        roadmap["stages"].append({
            "stage": name,
            "duration": duration,
            "focus": focus,
            "skills": missing_skills[first_skill:first_skill + _SKILLS_PER_STAGE],
            "courses": course_chunks[i],
            "projects": [get_project_template(career, name)],
        })

    return roadmap

# Smoke test: build the roadmap for student "S0001" and pretty-print it as JSON.
sample_roadmap = generate_single_roadmap("S0001")
print(json.dumps(sample_roadmap, indent=2))