In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from collections import defaultdict
import json
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs
np.random.seed(42)

print("üì¶ Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Read the processed course, user and interaction tables
courses = pd.read_csv('../data/processed/courses_with_id.csv')
users = pd.read_csv('../data/processed/synthetic_users.csv')
interactions = pd.read_csv('../data/processed/user_interactions.csv')

# Row counts, plus the percentage of user-course pairs that have no interaction row
n_courses, n_users, n_interactions = len(courses), len(users), len(interactions)
sparsity_pct = (1 - n_interactions / (n_users * n_courses)) * 100

print(f"‚úÖ Loaded data:")
print(f"   Courses: {n_courses:,}")
print(f"   Users: {n_users:,}")
print(f"   Interactions: {n_interactions:,}")
print(f"   Sparsity: {sparsity_pct:.2f}%")

## 2. Train/Test Split

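Split interactions into training (80%) and testing (20%) sets.
The split is done per user: every user with at least 5 interactions has 80% of them placed in training and 20% in testing, so those users appear in both sets. Users with fewer than 5 interactions are kept entirely in the training set.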

In [None]:
print("üîÄ Splitting data into train/test sets...\n")

# Users with fewer interactions than this are kept entirely in the training set
MIN_INTERACTIONS_TO_SPLIT = 5
# Fraction of each eligible user's interactions that is held out for testing
TEST_FRACTION = 0.2

# Split per user so every eligible user has interactions in both train and test
train_interactions = []
test_interactions = []

# A single groupby pass hands us each user's rows without rescanning the whole
# frame once per user; sort=False keeps users in first-appearance order.
for user_id, user_data in interactions.groupby('user_id', sort=False):
    if len(user_data) >= MIN_INTERACTIONS_TO_SPLIT:
        train, test = train_test_split(user_data, test_size=TEST_FRACTION, random_state=42)
        train_interactions.append(train)
        test_interactions.append(test)
    else:
        # Too few interactions to hold any out: keep them all for training
        train_interactions.append(user_data)

# pd.concat raises ValueError on an empty list (e.g. when no user reaches the
# threshold), so fall back to an empty frame that keeps the original columns.
empty_split = interactions.iloc[0:0]
train_df = (pd.concat(train_interactions, ignore_index=True)
            if train_interactions else empty_split.copy())
test_df = (pd.concat(test_interactions, ignore_index=True)
           if test_interactions else empty_split.copy())

# Avoid a ZeroDivisionError in the percentages when there are no interactions at all
total_rows = max(len(interactions), 1)

print(f"‚úÖ Split complete:")
print(f"   Training interactions: {len(train_df):,} ({len(train_df)/total_rows*100:.1f}%)")
print(f"   Testing interactions: {len(test_df):,} ({len(test_df)/total_rows*100:.1f}%)")
print(f"   Users in train: {train_df['user_id'].nunique()}")
print(f"   Users in test: {test_df['user_id'].nunique()}")
print(f"   Courses in train: {train_df['course_id'].nunique()}")
print(f"   Courses in test: {test_df['course_id'].nunique()}")

## 3. Content-Based Filtering Model

Uses TF-IDF to vectorize course skills and computes similarity between courses.

In [None]:
print("üéØ Building Content-Based Filtering Model...\n")

# Create TF-IDF vectors from course skills
vectorizer = TfidfVectorizer(
    max_features=500,  # Keep only the 500 most frequent terms
    min_df=2,          # Term must appear in at least 2 courses
    ngram_range=(1, 2) # Unigrams and bigrams
)

# Fit on skills_cleaned column (missing skills become an empty document)
skill_vectors = vectorizer.fit_transform(courses['skills_cleaned'].fillna(''))

print(f"   TF-IDF matrix shape: {skill_vectors.shape}")
print(f"   Vocabulary size: {len(vectorizer.vocabulary_)}")

# Compute course-course similarity matrix; row/column i corresponds to row i of `courses`
print(f"\n   Computing course similarity matrix...")
course_similarity = cosine_similarity(skill_vectors)

print(f"   ‚úÖ Similarity matrix: {course_similarity.shape}")
print(f"   Avg similarity: {course_similarity.mean():.4f}")

# Exclude self-similarity by masking the diagonal explicitly. Picking the
# (n+1)-th largest value of the flattened matrix is only correct when the
# diagonal holds exactly the n largest entries, which breaks for duplicate
# courses (off-diagonal 1.0) and for courses whose TF-IDF vector is all zeros
# (their self-similarity is 0, not 1).
off_diagonal = ~np.eye(course_similarity.shape[0], dtype=bool)
max_other_similarity = course_similarity[off_diagonal].max() if off_diagonal.any() else 0.0
print(f"   Max similarity (excluding self): {max_other_similarity:.4f}")

### 3.1 Content-Based Recommendation Function

In [None]:
def content_based_recommend(user_id, train_data, courses_df, similarity_matrix, top_n=10):
    """
    Recommend courses based on content similarity to courses user has interacted with.

    Row/column ``i`` of ``similarity_matrix`` is taken to describe the course in
    row ``i`` of ``courses_df``. Course IDs are translated to row positions via
    ``courses_df['course_id']`` rather than being used as row indices directly,
    so the function also works when IDs are not the contiguous range 0..n-1.
    When IDs do equal row positions the results are unchanged.

    Args:
        user_id: User ID
        train_data: Training interaction data with 'user_id' and 'course_id' columns
        courses_df: Course dataframe, row-aligned with similarity_matrix
        similarity_matrix: Course-course similarity matrix
        top_n: Number of recommendations

    Returns:
        List of (course_id, score) tuples in descending score order. Only courses
        with a strictly positive score are kept, so fewer than top_n entries may
        be returned; the list is empty when the user has no usable history.
    """
    # Map course IDs to matrix rows; fall back to positional IDs if the
    # dataframe has no 'course_id' column.
    if 'course_id' in courses_df.columns:
        catalog_ids = courses_df['course_id'].values
    else:
        catalog_ids = np.arange(len(courses_df))
    row_of = {cid: row for row, cid in enumerate(catalog_ids)}

    # Courses the user interacted with; IDs missing from the catalog are ignored
    user_courses = train_data.loc[train_data['user_id'] == user_id, 'course_id'].values
    seen_rows = [row_of[cid] for cid in user_courses if cid in row_of]

    if not seen_rows:
        return []  # Cold start - no history

    # Average the similarity rows of every course in the user's history
    course_scores = np.zeros(len(courses_df))
    for row in seen_rows:
        course_scores += similarity_matrix[row]
    course_scores = course_scores / len(seen_rows)

    # Push already-seen courses below the positive-score cut-off
    course_scores[seen_rows] = -1

    # Highest scores first, then drop anything that is not strictly positive
    top_rows = np.argsort(course_scores)[::-1][:top_n]
    return [(catalog_ids[row], course_scores[row])
            for row in top_rows if course_scores[row] > 0]

# Smoke-test the content-based recommender on a single user
sample_user = 0
sample_recs = content_based_recommend(
    user_id=sample_user,
    train_data=train_df,
    courses_df=courses,
    similarity_matrix=course_similarity,
    top_n=5,
)

print(f"\nüìù Sample recommendations for User {sample_user}:")
for rec_id, rec_score in sample_recs:
    # Look up the display name of the recommended course
    title = courses.loc[courses['course_id'] == rec_id, 'Course Name'].iloc[0]
    print(f"   {title[:50]}: {rec_score:.4f}")

## 4. Collaborative Filtering Model

Uses matrix factorization (SVD) to learn latent features from user-course interactions.

In [None]:
print("ü§ù Building Collaborative Filtering Model...\n")

try:
    from surprise import SVD, Dataset, Reader
    from surprise.model_selection import cross_validate
    print("   ‚úÖ Surprise library loaded")
except ImportError:
    print("   ‚ö†Ô∏è  Installing surprise library...")
    import importlib
    import subprocess
    import sys
    # Install with the interpreter that runs this kernel; a bare `pip` on PATH
    # may belong to a different Python environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-surprise"])
    # Make the import system notice the freshly installed package
    importlib.invalidate_caches()
    from surprise import SVD, Dataset, Reader
    from surprise.model_selection import cross_validate
    print("   ‚úÖ Surprise library installed and loaded")

# Prepare data for Surprise
# NOTE(review): assumes the 'interaction' column only holds values in [0, 1] - confirm
reader = Reader(rating_scale=(0, 1))  # Binary interactions

# Create dataset from train_df (Surprise expects user, item, rating column order)
train_surprise = Dataset.load_from_df(
    train_df[['user_id', 'course_id', 'interaction']],
    reader
)

# Build full trainset (uses every training row; no internal hold-out)
trainset = train_surprise.build_full_trainset()

print(f"   Training set: {trainset.n_users} users, {trainset.n_items} courses")
print(f"   Total ratings: {trainset.n_ratings}")

# Train SVD model
print(f"\n   Training SVD model...")
svd_model = SVD(
    n_factors=50,      # Latent features
    n_epochs=20,       # Training iterations
    lr_all=0.005,      # Learning rate
    reg_all=0.02,      # Regularization
    random_state=42
)

svd_model.fit(trainset)

print(f"   ‚úÖ SVD model trained!")
print(f"   Latent factors: {svd_model.n_factors}")

### 4.1 Collaborative Filtering Recommendation Function

In [None]:
def collaborative_recommend(user_id, train_data, courses_df, model, top_n=10):
    """
    Rank unseen courses for a user by the rating a collaborative model predicts.

    Args:
        user_id: User ID
        train_data: Training interaction data ('user_id' and 'course_id' columns)
        courses_df: Course dataframe; its 'course_id' column is the candidate list
        model: Trained model exposing predict(user_id, course_id) whose result has `.est`
        top_n: Number of recommendations

    Returns:
        List of (course_id, score) tuples, highest predicted score first
    """
    # Courses from the user's training history are never recommended again
    seen = set(train_data.loc[train_data['user_id'] == user_id, 'course_id'].values)

    # Score every remaining course in the catalog
    scored = [
        (candidate, model.predict(user_id, candidate).est)
        for candidate in courses_df['course_id'].values
        if candidate not in seen
    ]

    # Stable descending sort by predicted rating, then truncate
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Smoke-test the collaborative recommender on the sample user
sample_recs_cf = collaborative_recommend(
    user_id=sample_user,
    train_data=train_df,
    courses_df=courses,
    model=svd_model,
    top_n=5,
)

print(f"\nüìù CF recommendations for User {sample_user}:")
for rec_id, predicted in sample_recs_cf:
    # Look up the display name of the recommended course
    title = courses.loc[courses['course_id'] == rec_id, 'Course Name'].iloc[0]
    print(f"   {title[:50]}: {predicted:.4f}")

## 5. Hybrid Recommender Model

Combines content-based and collaborative filtering predictions.

In [None]:
print("üîó Building Hybrid Recommender Model...\n")

def hybrid_recommend(user_id, train_data, courses_df, similarity_matrix, cf_model,
                     content_weight=0.6, cf_weight=0.4, top_n=10, candidate_pool=50):
    """
    Hybrid recommender combining content-based and collaborative filtering.

    Each model contributes its own top candidates; their scores are rescaled to
    the 0-1 range and blended with the given weights. A course proposed by only
    one model gets a score of 0 from the other.

    Args:
        user_id: User ID
        train_data: Training interaction data
        courses_df: Course dataframe
        similarity_matrix: Course-course similarity matrix
        cf_model: Trained collaborative filtering model
        content_weight: Weight for content-based score (0-1)
        cf_weight: Weight for CF score (0-1)
        top_n: Number of recommendations
        candidate_pool: How many candidates to request from each model before
            blending (default 50). At least top_n candidates are always
            requested so that a large top_n is not starved of candidates.

    Returns:
        List of (course_id, score, content_score, cf_score) tuples
    """
    pool_size = max(candidate_pool, top_n)

    # Get recommendations from both models
    content_recs = content_based_recommend(user_id, train_data, courses_df,
                                           similarity_matrix, top_n=pool_size)
    cf_recs = collaborative_recommend(user_id, train_data, courses_df,
                                      cf_model, top_n=pool_size)

    # Content scores: scale by the maximum so the best candidate gets 1.0
    content_dict = {}
    if content_recs:
        max_content = max(score for _, score in content_recs)
        if max_content > 0:  # guard the division; scores are expected to be positive
            content_dict = {cid: score / max_content for cid, score in content_recs}
        else:
            content_dict = {cid: 0 for cid, _ in content_recs}

    # CF scores: min-max scale within the candidate pool (all-equal scores -> 0)
    cf_dict = {}
    if cf_recs:
        max_cf = max(score for _, score in cf_recs)
        min_cf = min(score for _, score in cf_recs)
        cf_span = max_cf - min_cf
        cf_dict = {cid: (score - min_cf) / cf_span if cf_span > 0 else 0
                   for cid, score in cf_recs}

    # Combine scores over the union of both candidate sets
    hybrid_scores = []
    for course_id in set(content_dict) | set(cf_dict):
        content_score = content_dict.get(course_id, 0)
        cf_score = cf_dict.get(course_id, 0)

        # Weighted combination
        final_score = content_weight * content_score + cf_weight * cf_score

        hybrid_scores.append((course_id, final_score, content_score, cf_score))

    # Sort by final score
    hybrid_scores.sort(key=lambda x: x[1], reverse=True)

    return hybrid_scores[:top_n]

# Smoke-test the hybrid recommender on the sample user
sample_recs_hybrid = hybrid_recommend(
    user_id=sample_user,
    train_data=train_df,
    courses_df=courses,
    similarity_matrix=course_similarity,
    cf_model=svd_model,
    top_n=5,
)

print(f"üìù Hybrid recommendations for User {sample_user}:")
print(f"   (Weights: {0.6:.1f} content + {0.4:.1f} CF)\n")
for rec in sample_recs_hybrid:
    rec_id, blended, from_content, from_cf = rec
    # Look up the display name of the recommended course
    title = courses.loc[courses['course_id'] == rec_id, 'Course Name'].iloc[0]
    print(f"   {title[:40]}")
    print(f"      Final: {blended:.4f} (Content: {from_content:.4f}, CF: {from_cf:.4f})")

## 6. Evaluation Metrics

Evaluate all three models using Precision@K and Recall@K.

In [None]:
def evaluate_model(recommend_function, test_data, k_values=(5, 10)):
    """
    Evaluate recommendation model using Precision@K and Recall@K.

    Users for whom ``recommend_function`` returns nothing are skipped and do not
    count towards the averages. Users for whom it raises are skipped as well,
    but the failures are counted and reported instead of being silently dropped.

    Args:
        recommend_function: Function that takes a user ID and returns a ranked
            list of tuples whose first element is the course ID
        test_data: Test interaction dataframe ('user_id' and 'course_id' columns)
        k_values: Iterable of K values to evaluate

    Returns:
        Dictionary {k: {'precision': float, 'recall': float}} holding the mean
        over evaluated users (0.0 when no user could be evaluated)
    """
    k_values = list(k_values)
    per_user = {k: {'precision': [], 'recall': []} for k in k_values}

    # Group test data by user: user_id -> set of held-out course IDs
    test_by_user = test_data.groupby('user_id')['course_id'].apply(set).to_dict()

    failures = []  # (user_id, exception) for users that could not be scored

    for user_id, actual_courses in test_by_user.items():
        try:
            recommendations = recommend_function(user_id)
            # The course ID is the first tuple element for every recommender
            # format (2-tuples and the hybrid model's 4-tuples alike).
            ranked_ids = [rec[0] for rec in recommendations] if recommendations else []
        except Exception as exc:
            # Best effort: keep evaluating the remaining users, but remember the failure
            failures.append((user_id, exc))
            continue

        if not ranked_ids:
            continue

        for k in k_values:
            hits = len(set(ranked_ids[:k]) & actual_courses)

            # Precision@K = hits / K; Recall@K = hits / total_relevant
            per_user[k]['precision'].append(hits / k if k > 0 else 0)
            per_user[k]['recall'].append(hits / len(actual_courses) if actual_courses else 0)

    if failures:
        first_user, first_error = failures[0]
        print(f"   Skipped {len(failures)} user(s) whose recommendations raised an error; "
              f"first: user {first_user!r}: {first_error!r}")

    # Average per-user scores; plain floats keep the result JSON-serialisable
    return {
        k: {
            metric: float(np.mean(values)) if values else 0.0
            for metric, values in per_user[k].items()
        }
        for k in k_values
    }

print("üìä Evaluating models on test set...\n")
print("   This may take a few minutes...\n")

### 6.1 Evaluate Content-Based Model

In [None]:
print("üéØ Evaluating Content-Based Model...")

# Score the content-based recommender against the held-out interactions
content_results = evaluate_model(
    recommend_function=lambda uid: content_based_recommend(
        uid, train_df, courses, course_similarity, top_n=10
    ),
    test_data=test_df,
)

print("\n   Content-Based Results:")
for cutoff in content_results:
    scores = content_results[cutoff]
    print(f"      @{cutoff}: Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}")

### 6.2 Evaluate Collaborative Filtering Model

In [None]:
print("ü§ù Evaluating Collaborative Filtering Model...")

# Score the SVD recommender against the held-out interactions
cf_results = evaluate_model(
    recommend_function=lambda uid: collaborative_recommend(
        uid, train_df, courses, svd_model, top_n=10
    ),
    test_data=test_df,
)

print("\n   Collaborative Filtering Results:")
for cutoff in cf_results:
    scores = cf_results[cutoff]
    print(f"      @{cutoff}: Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}")

### 6.3 Evaluate Hybrid Model

In [None]:
print("üîó Evaluating Hybrid Model...")

# Score the 60/40 hybrid recommender against the held-out interactions
hybrid_results = evaluate_model(
    recommend_function=lambda uid: hybrid_recommend(
        uid, train_df, courses, course_similarity, svd_model,
        content_weight=0.6, cf_weight=0.4, top_n=10
    ),
    test_data=test_df,
)

print("\n   Hybrid Model Results:")
for cutoff in hybrid_results:
    scores = hybrid_results[cutoff]
    print(f"      @{cutoff}: Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}")

## 7. Compare All Models

In [None]:
print("\n" + "="*60)
print("üìä MODEL COMPARISON")
print("="*60)

# One row per model, built from the evaluation dictionaries computed above
model_results = [
    ('Content-Based', content_results),
    ('Collaborative Filtering', cf_results),
    ('Hybrid (0.6 + 0.4)', hybrid_results),
]

comparison_df = pd.DataFrame([
    {
        'Model': model_name,
        'Precision@5': results[5]['precision'],
        'Recall@5': results[5]['recall'],
        'Precision@10': results[10]['precision'],
        'Recall@10': results[10]['recall']
    }
    for model_name, results in model_results
])

# Concatenate rather than pass two print arguments: the default separator
# would put a space before the header row and shift it out of alignment.
print("\n" + comparison_df.to_string(index=False))

# Identify best model (idxmax returns the first row when scores tie)
best_p5 = comparison_df.loc[comparison_df['Precision@5'].idxmax(), 'Model']
best_r5 = comparison_df.loc[comparison_df['Recall@5'].idxmax(), 'Model']

print(f"\nüèÜ BEST PERFORMERS:")
print(f"   Precision@5: {best_p5}")
print(f"   Recall@5: {best_r5}")

## 8. Visualize Results

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# Shared bar geometry: one group per model, two bars (@5 and @10) per group
x = np.arange(len(comparison_df))
width = 0.35

# (axis, y-label, title, (column, colour) of the left bar, (column, colour) of the right bar)
panel_specs = [
    (axes[0], 'Precision', 'Precision Comparison',
     ('Precision@5', '#2E86AB'), ('Precision@10', '#6A994E')),
    (axes[1], 'Recall', 'Recall Comparison',
     ('Recall@5', '#F18F01'), ('Recall@10', '#A23B72')),
]

for ax, ylabel, title, left_bar, right_bar in panel_specs:
    ax.bar(x - width/2, comparison_df[left_bar[0]], width, label=left_bar[0], color=left_bar[1])
    ax.bar(x + width/2, comparison_df[right_bar[0]], width, label=right_bar[0], color=right_bar[1])
    ax.set_xlabel('Model', fontweight='bold')
    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(comparison_df['Model'], rotation=15, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 9. Save Models

In [None]:
print("üíæ Saving models and results...\n")

import os
import pickle

# Make sure the output directory exists before writing anything
os.makedirs('../models', exist_ok=True)

# Content-based artefacts: the fitted vectorizer and the similarity matrix
content_artifacts = {
    'vectorizer': vectorizer,
    'similarity_matrix': course_similarity
}
with open('../models/content_based_model.pkl', 'wb') as content_file:
    pickle.dump(content_artifacts, content_file)
print("   ‚úÖ Saved: content_based_model.pkl")

# Trained SVD model
with open('../models/cf_model.pkl', 'wb') as cf_file:
    pickle.dump(svd_model, cf_file)
print("   ‚úÖ Saved: cf_model.pkl")

# Metrics for all three models, the comparison table and the hybrid weights
evaluation_results = dict(
    content_based=content_results,
    collaborative_filtering=cf_results,
    hybrid=hybrid_results,
    comparison_table=comparison_df.to_dict('records'),
    hybrid_weights={'content': 0.6, 'cf': 0.4},
)
with open('../models/evaluation_results.json', 'w') as results_file:
    json.dump(evaluation_results, results_file, indent=2)
print("   ‚úÖ Saved: evaluation_results.json")

# The same comparison table as CSV
comparison_df.to_csv('../models/evaluation_results.csv', index=False)
print("   ‚úÖ Saved: evaluation_results.csv")

print(f"\n‚ú® ALL MODELS SAVED!")
print(f"   Location: ../models/")

## 📋 Summary

### What We Built
1. ✅ **Content-Based Model**: TF-IDF + cosine similarity on course skills
2. ✅ **Collaborative Filtering**: SVD matrix factorization on user interactions
3. ✅ **Hybrid Model**: Weighted combination (60% content + 40% CF)

### Evaluation
- Tested on the 20% of interactions held out per user (users with fewer than 5 interactions are used for training only)
- Metrics: Precision@K and Recall@K
- Compared all three models

### Next Steps
- Deploy the best-performing model
- Build API endpoints for recommendations
- Add explainability (why this recommendation?)
- Collect user feedback to improve models 🚀