# Step 4: Career Prediction Model

This notebook builds a machine learning model to predict career paths for students based on:
- Academic performance (GPA, attendance, grades)
- Skill counts and course completion
- Skill gap analysis from Step 2
- Student embeddings (PCA-reduced)

**Career Classes**: Data, Machine Learning, Cloud, Cybersecurity, Network, DevOps, Software, Other

In [None]:
# Cell 1: Imports & seeds
import os, json, pickle, joblib, re
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Seed Python's and NumPy's global random generators with the same fixed
# value so anything drawing from them repeats from run to run.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(42)

print("✅ Imports complete")

In [None]:
# Cell 2: Load data
BASE = Path(".")

# Load students dataset.
# StudentID is read as text: every later lookup (profiles_map, emb_map,
# predict_career) keys on the string form of the ID, so letting pandas infer
# an integer dtype would make those joins and comparisons miss.
df = pd.read_csv(
    BASE / "digital_twin_students_1500_cleaned.csv",
    dtype={"StudentID": str},
)
print(f"Loaded {len(df)} students")

# Load skill gap profiles from Step 2 (explicit encoding so the result does
# not depend on the platform's default locale)
with open(BASE / "skill_gap_profiles" / "student_profiles.json", "r", encoding="utf-8") as f:
    profiles = json.load(f)
print(f"Loaded {len(profiles)} profiles")

# Load student embeddings.
# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load embedding files produced by a trusted pipeline.
with open(BASE / "embeddings" / "embeddings_students.pkl", "rb") as f:
    emb_data = pickle.load(f)
student_ids = emb_data['ids']
student_embeddings = np.vstack(emb_data['embeddings'])

# ids and embedding rows are paired positionally later on; a length mismatch
# would silently drop or mis-assign vectors, so fail loudly here instead.
if len(student_ids) != student_embeddings.shape[0]:
    raise ValueError(
        f"Embedding file is inconsistent: {len(student_ids)} ids vs "
        f"{student_embeddings.shape[0]} embedding rows"
    )
print(f"Loaded embeddings: {student_embeddings.shape}")

print("\n✅ Data loaded successfully")

In [None]:
# Cell 3: Create career labels (pseudo-labels from top job matches)

def map_job_to_class(job_title):
    """Map a job title to one of the 8 career classes.

    Matching is case-insensitive and rule-based; the first rule that matches
    wins. Long keywords are matched as substrings, while short acronyms
    ("ai", "ml", "etl", "aws", "gcp", "sre") must appear as whole words so
    that titles such as "Maintenance Engineer" or "HTML Developer" are not
    misread as Machine Learning roles.

    The generic "analyst" keyword is checked only after the specialised
    classes, so "Security Analyst" maps to Cybersecurity rather than Data.

    Args:
        job_title: Any value; it is converted with ``str()`` before matching.

    Returns:
        One of "Data", "Machine Learning", "Cloud", "Cybersecurity",
        "Network", "DevOps", "Software" or "Other".
    """
    title = str(job_title).lower()
    # Alphanumeric words of the title, used for whole-word acronym matching.
    words = set(re.findall(r"[a-z0-9]+", title))

    # (label, substring keywords, whole-word keywords) in priority order.
    rules = (
        ("Data", ("data",), ("etl",)),
        ("Machine Learning", ("machine learning", "deep learning", "mlops"), ("ml", "ai")),
        ("Cloud", ("cloud", "azure"), ("aws", "gcp")),
        ("Cybersecurity", ("security", "cyber", "penetration", "infosec"), ()),
        ("Network", ("network", "routing", "switching"), ()),
        ("DevOps", ("devops", "ci/cd", "infrastructure"), ("sre",)),
        # Generic analyst roles fall back to Data once no specialised class matched.
        ("Data", ("analyst",), ()),
        ("Software", ("developer", "software", "backend", "frontend", "full stack", "engineer"), ()),
    )

    for label, substrings, acronyms in rules:
        if any(key in title for key in substrings) or not words.isdisjoint(acronyms):
            return label
    return "Other"

# Build labels from profiles (top job match).
# Profile keys are normalised to str so they match the str(StudentID) lookups
# used below and in later cells, whatever type the JSON stored them as.
label_rows = []
profiles_map = {str(p['student_id']): p for p in profiles}

# Keep StudentID as text in df itself so that it has the same type as the
# profile/embedding keys and as the value predict_career() compares against.
df['StudentID'] = df['StudentID'].astype(str)

for sid in df['StudentID'].tolist():
    p = profiles_map.get(sid, {})
    top_job = None

    matches = p.get("best_job_matches")
    if matches:
        # Entries may be dicts carrying a "job_title" or bare title strings.
        item = matches[0]
        top_job = item.get("job_title") if isinstance(item, dict) else item

    label = map_job_to_class(top_job or "")
    label_rows.append({'StudentID': sid, 'career_label': label})

labels_df = pd.DataFrame(label_rows, columns=['StudentID', 'career_label'])

# label_rows was built in df row order, so assign positionally instead of
# merging: a merge would create career_label_x/_y when this cell is re-run
# and would multiply rows if a StudentID appears more than once.
df['career_label'] = labels_df['career_label'].to_numpy()

# Check label distribution
print("\nLabel Distribution:")
print(df['career_label'].value_counts())
print("\nLabel Percentages:")
print(df['career_label'].value_counts(normalize=True).round(3))

print("\n✅ Labels created")

In [None]:
# Cell 4: Feature engineering

# 1. Academic features
def _numeric_or_default(frame, column, fill_value, missing_column_value=0):
    """Return `column` coerced to numbers, with unparseable cells set to `fill_value`.

    If the column does not exist at all, a constant Series of
    `missing_column_value` aligned to `frame` is returned instead.
    """
    if column not in frame.columns:
        return pd.Series(missing_column_value, index=frame.index, dtype=float)
    return pd.to_numeric(frame[column], errors='coerce').fillna(fill_value)

# Coerce GPA first and impute with the mean of the *coerced* values; taking the
# mean of the raw column fails (or is wrong) when it contains non-numeric text.
_gpa_numeric = pd.to_numeric(df['GPA'], errors='coerce')
df['GPA'] = _gpa_numeric.fillna(_gpa_numeric.mean())

# Unparseable attendance is imputed as 80 and unparseable failed-course counts
# as 0; an absent column becomes all zeros (the default given to df.get before).
df['AttendancePercent'] = _numeric_or_default(df, 'AttendancePercent', fill_value=80)
df['FailedCourses'] = _numeric_or_default(df, 'FailedCourses', fill_value=0)

# 2. Skill and course counts
def list_count(cell):
    """Return how many non-blank entries a delimited text cell contains.

    Entries are separated by any run of ';', ',', '|' or '/'. Missing values,
    blank strings and the literal text "nan" (in any case) count as zero.
    """
    if pd.isna(cell):
        return 0

    text = str(cell).strip()
    if text.lower() in ("", "nan"):
        return 0

    pieces = re.split(r'[;,|/]+', text)
    return sum(1 for piece in pieces if piece.strip())

# A missing source column means "no items" for every student. (df.get with a
# string default would hand back a plain str, which has no .apply.)
df['num_skills'] = df['Skills'].apply(list_count) if 'Skills' in df.columns else 0
df['num_courses_completed'] = (
    df['CoursesCompleted'].apply(list_count) if 'CoursesCompleted' in df.columns else 0
)

# 3. Major average (from grade columns if available)
# Any column whose name contains "grade" (case-insensitive) is used.
subject_cols = [c for c in df.columns if "grade" in c.lower()]
if subject_cols:
    # Coerce to numbers so text-valued grade columns become NaN instead of
    # breaking the row mean; -1 is replaced with NaN so it is left out of the average.
    grades = df[subject_cols].apply(pd.to_numeric, errors='coerce').replace(-1, np.nan)
    # Rows with no usable grade fall back to the student's GPA.
    df['major_avg'] = grades.mean(axis=1).fillna(df['GPA'])
else:
    df['major_avg'] = df['GPA']

# 4. Gap features from Step 2 profiles
def get_profile(sid):
    """Look up the Step 2 profile stored under str(sid); empty dict if absent."""
    key = str(sid)
    return profiles_map.get(key, {})


def _count_missing_skills(sid):
    """Number of entries in the profile's skill_gaps -> missing_skills list."""
    gaps = get_profile(sid).get('skill_gaps', {})
    return len(gaps.get('missing_skills', []))


df['num_missing_skills'] = df['StudentID'].apply(_count_missing_skills)


def top_priority_mean(sid):
    """Mean priority_score over the profile's priority_skills (0.0 when there are none).

    Entries without a 'priority_score' key contribute 0 to the mean.
    """
    entries = get_profile(sid).get('skill_gaps', {}).get('priority_skills', [])
    if entries:
        scores = [entry.get('priority_score', 0) for entry in entries]
        return float(np.mean(scores))
    return 0.0


df['top_missing_priority'] = df['StudentID'].apply(top_priority_mean)

# 5. Align and reduce embeddings (PCA to at most 32 dims)
from sklearn.decomposition import PCA

# Build mapping from embedding ids to embeddings
emb_map = {str(sid): emb for sid, emb in zip(student_ids, student_embeddings)}

# Create matrix aligned to df order. Students without an embedding get a zero
# vector; report how many so the substitution is not silent.
emb_dim = student_embeddings.shape[1]
df_ids = df['StudentID'].astype(str)
n_without_embedding = int((~df_ids.isin(list(emb_map.keys()))).sum())
if n_without_embedding:
    print(f"⚠️ {n_without_embedding} students have no embedding; using zero vectors")
emb_matrix = np.vstack([emb_map.get(sid, np.zeros(emb_dim)) for sid in df_ids])

# PCA reduction. PCA cannot return more components than
# min(n_samples, n_features), so cap the requested 32 accordingly.
# NOTE(review): PCA is fitted on all rows, i.e. before any train/test split,
# so held-out rows influence the projection — confirm this is acceptable.
n_components = min(32, emb_matrix.shape[0], emb_matrix.shape[1])
pca = PCA(n_components=n_components, random_state=42)
emb_pca = pca.fit_transform(emb_matrix)
print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.3f}")

# Add PCA features to dataframe in one concat (avoids fragmenting the frame
# with many single-column inserts). Existing emb_pca_* columns are dropped
# first so re-running the cell cannot leave stale components behind.
pca_cols = [f'emb_pca_{i}' for i in range(emb_pca.shape[1])]
stale_pca_cols = [c for c in df.columns if c.startswith('emb_pca_')]
df = pd.concat(
    [
        df.drop(columns=stale_pca_cols),
        pd.DataFrame(emb_pca, columns=pca_cols, index=df.index),
    ],
    axis=1,
)

print(f"\n✅ Features created")
print(f"Total features: {8 + n_components} (8 academic/gap + {n_components} embedding)")

In [None]:
# Cell 5: Encode labels and train/test split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Feature columns: the eight academic/gap features followed by every
# emb_pca_* column currently present in df (in df column order).
feature_cols = [
    'GPA',
    'major_avg',
    'AttendancePercent',
    'FailedCourses',
    'num_skills',
    'num_courses_completed',
    'num_missing_skills',
    'top_missing_priority',
    *(col for col in df.columns if col.startswith('emb_pca_')),
]

# Feature matrix (remaining NaNs become 0) and integer-encoded target
# (students without a label are treated as 'Other').
X = df[feature_cols].fillna(0)
career_labels = df['career_label'].fillna('Other')
le = LabelEncoder().fit(career_labels)
y = le.transform(career_labels)

print(f"Feature matrix shape: {X.shape}")
print(f"Classes: {le.classes_}")
print(f"Number of classes: {len(le.classes_)}")

# 80/20 split that keeps the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTrain size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

# Sanity checks on the full feature matrix: no NaN, no +/-inf
feature_values = X.values
assert not np.isnan(feature_values).any(), "Found NaN values in features!"
assert np.isfinite(feature_values).all(), "Found inf values in features!"

print("\n✅ Data prepared for training")

In [None]:
# Cell 6: Train baseline models (RandomForest + XGBoost)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

print("Training RandomForest baseline...")
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
yhat_rf = rf.predict(X_test)

print(f"\nRandomForest Results:")
print(f"  Accuracy: {accuracy_score(y_test, yhat_rf):.3f}")
print(f"  Macro F1: {f1_score(y_test, yhat_rf, average='macro'):.3f}")

print("\n" + "="*60)
print("Training XGBoost model...")
# `use_label_encoder` is deliberately not passed: it is deprecated in XGBoost
# and only triggers an "unused parameter" warning in current releases. The
# labels are already integer-encoded by `le`.
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)
yhat_xgb = xgb.predict(X_test)

print(f"\nXGBoost Results:")
print(f"  Accuracy: {accuracy_score(y_test, yhat_xgb):.3f}")
print(f"  Macro F1: {f1_score(y_test, yhat_xgb, average='macro'):.3f}")

print("\n" + "="*60)
print("\nDetailed Classification Report (XGBoost):")
# Pin `labels` to the encoder's full class list so the report lines up with
# `target_names` even when a rare class is absent from the test fold
# (otherwise sklearn raises on the length mismatch). Classes with no
# predictions report 0 rather than emitting a warning.
print(classification_report(
    y_test,
    yhat_xgb,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

print("\n✅ Models trained")

In [None]:
# Cell 7: Cross-validation (optional - for more robust evaluation)
from sklearn.model_selection import cross_val_score

print("Running 5-fold cross-validation on XGBoost...")

# Macro-F1 over 5 folds of the full feature matrix, folds evaluated in parallel
cv_scores = cross_val_score(
    estimator=xgb,
    X=X,
    y=y,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
cv_f1_mean = cv_scores.mean()
cv_f1_std = cv_scores.std()

print(f"\nCross-validation F1 scores: {cv_scores}")
print(f"Mean CV F1: {cv_f1_mean:.3f} (+/- {cv_f1_std:.3f})")

print("\n✅ Cross-validation complete")

In [None]:
# Cell 8: Evaluation and confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# The figure is written under models/, which is otherwise only created by the
# later artifact-saving cell; create it here so a fresh top-to-bottom run
# does not fail on savefig.
os.makedirs("models", exist_ok=True)

# Confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
ConfusionMatrixDisplay.from_estimator(
    xgb, X_test, y_test,
    # Pin the label set to the encoder's classes so the axes always match
    # display_labels, even if a class is missing from the test fold.
    labels=np.arange(len(le.classes_)),
    display_labels=le.classes_,
    cmap='Blues',
    xticks_rotation=45,
    ax=ax
)
ax.set_title('Confusion Matrix - XGBoost Career Prediction')
fig.tight_layout()
fig.savefig('models/confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ Confusion matrix saved to models/confusion_matrix.png")

In [None]:
# Cell 9: Model explainability (SHAP)
import shap

# The plot is saved under models/; make sure the directory exists even when
# the artifact-saving cell has not run yet.
os.makedirs("models", exist_ok=True)

print("Computing SHAP values (this may take a minute)...")
explainer = shap.TreeExplainer(xgb)
# Explain at most 200 training rows; DataFrame.sample raises if asked for
# more rows than exist, so cap at the training-set size.
sample_data = X_train.sample(min(200, len(X_train)), random_state=42)
shap_values = explainer.shap_values(sample_data)

# NOTE(review): depending on the shap release, multiclass output is either a
# list with one (n_samples, n_features) array per class or a single
# (n_samples, n_features, n_classes) array — confirm against the installed
# version. Normalise to the per-class list form for summary_plot.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# Summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(
    shap_values,
    sample_data,
    feature_names=list(X_train.columns),
    class_names=list(le.classes_),  # legend shows career names, not "Class k"
    show=False
)
plt.tight_layout()
plt.savefig('models/shap_summary.png', dpi=150, bbox_inches='tight')
plt.show()

# Feature importance from model
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

print("\n✅ SHAP analysis complete")

In [None]:
# Cell 10: Save artifacts
import os

os.makedirs("models", exist_ok=True)

# Persist the fitted model together with the encoder, PCA and feature list
for _obj, _path in (
    (xgb, "models/career_model_xgb.pkl"),
    (le, "models/label_encoder.pkl"),
    (pca, "models/emb_pca.pkl"),
    (feature_cols, "models/feature_list.pkl"),
):
    joblib.dump(_obj, _path)

# Per-student feature table (ID + model inputs) for API use
features_all = df[['StudentID', *feature_cols]].copy()
features_all.to_csv("models/features_all.csv", index=False)

# Feature importance table
feature_importance.to_csv("models/feature_importance.csv", index=False)

# Summary of what the models/ directory is expected to contain
for _line in (
    "✅ Artifacts saved to models/ directory:",
    "  - career_model_xgb.pkl",
    "  - label_encoder.pkl",
    "  - emb_pca.pkl",
    "  - feature_list.pkl",
    "  - features_all.csv",
    "  - feature_importance.csv",
    "  - confusion_matrix.png",
    "  - shap_summary.png",
):
    print(_line)

In [None]:
# Cell 11: Prediction function and examples

def predict_career(student_id):
    """Predict the career class for one student already present in `df`.

    Args:
        student_id: Student identifier; compared to `df['StudentID']` by
            string value, so `123` and `"123"` refer to the same student.

    Returns:
        dict: On success, the keys ``student_id``, ``predicted_career``,
        ``confidence`` (probability of the predicted class),
        ``top_3_careers`` (most probable first) and ``probabilities``
        (class name -> probability). If no row matches, a dict with
        ``student_id`` and ``error`` ("student not found").
    """
    # Compare as strings on both sides so the lookup works whether the
    # StudentID column holds text or numbers.
    row = df[df['StudentID'].astype(str) == str(student_id)]

    if row.empty:
        return {"student_id": student_id, "error": "student not found"}

    # If the ID is duplicated in df, the first matching row is used.
    X_row = row[feature_cols].fillna(0).iloc[[0]]
    probs = xgb.predict_proba(X_row)[0]

    # Stable descending sort: ties resolve to the lowest class index (the same
    # choice np.argmax makes), so predicted_career always equals top_3_careers[0].
    ranked = np.argsort(-probs, kind="stable")
    idx = int(ranked[0])
    label = str(le.inverse_transform([idx])[0])

    # Get top 3 predictions
    top3 = [str(name) for name in le.inverse_transform(ranked[:3])]

    # Plain str/float values keep the result JSON-serialisable.
    prob_map = {str(name): float(p) for name, p in zip(le.classes_, probs)}

    return {
        "student_id": student_id,
        "predicted_career": label,
        "confidence": float(probs[idx]),
        "top_3_careers": top3,
        "probabilities": prob_map
    }

# Test predictions on sample students
print("Sample Predictions:\n")
# Sample at most 5 students; DataFrame.sample raises when asked for more rows
# than exist.
sample_ids = df['StudentID'].sample(min(5, len(df)), random_state=42).tolist()

for sid in sample_ids:
    result = predict_career(sid)
    print(f"Student {sid}:")
    if "error" in result:
        # predict_career returns only student_id/error when the lookup fails;
        # show that instead of raising KeyError on the missing prediction keys.
        print(f"  Error: {result['error']}")
    else:
        print(f"  Predicted: {result['predicted_career']} ({result['confidence']:.2%})")
        print(f"  Top 3: {', '.join(result['top_3_careers'])}")
    print()

print("\n✅ Prediction function ready")