In [1]:
import json
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
warnings.filterwarnings('ignore')

## 1. LOAD AND PROCESS DATA

In [2]:
# Read the guide survey export into a DataFrame. The bare expression on the
# last line makes the notebook render the frame as the cell output.
survey_csv_path = 'data/semenggoh_guide_survey_data_20250524_233725.csv'
df = pd.read_csv(survey_csv_path)
df

Unnamed: 0,Guide_ID,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q24,Q25,Q26,Basic_Skills_Avg,Nature_Knowledge_Avg,Interpretation_Avg,Leadership_Safety_Avg,Cultural_Expertise_Avg,Overall_Average,Recommended_Course
0,GUIDE_0002,2,3,3,4,2,2,2,2,1,...,3,2,2,3.00,2.00,2.50,2.500,2.333333,2.461538,Nature Guide Fundamentals
1,GUIDE_0002,4,3,4,4,4,4,3,3,3,...,3,3,3,3.75,3.50,3.50,3.000,3.166667,3.307692,Advanced Park Guiding: Leadership and Safety
2,GUIDE_0002,4,3,3,2,2,3,3,2,3,...,2,3,2,3.00,2.50,3.25,3.125,2.166667,2.807692,Master Park Guide Certification Program
3,GUIDE_0002,1,2,4,2,2,2,1,3,3,...,1,3,2,2.25,2.00,2.50,2.000,2.000000,2.115385,Nature Guide Fundamentals
4,GUIDE_0002,5,4,4,3,5,5,3,4,4,...,3,4,2,4.00,4.25,4.00,3.250,3.166667,3.615385,Master Park Guide Certification Program
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10573,GUIDE_1323,2,3,4,4,3,2,2,3,3,...,2,3,3,3.25,2.50,3.75,2.750,2.500000,2.884615,Nature Guide Fundamentals
10574,GUIDE_1323,2,4,4,4,4,4,4,4,3,...,3,3,3,3.50,4.00,3.00,3.000,3.333333,3.307692,Eco-Guide Training: Field & Interpretation Skills
10575,GUIDE_1323,4,3,4,4,1,3,1,3,2,...,1,2,2,3.75,2.00,3.25,3.000,2.166667,2.807692,Nature Guide Fundamentals
10576,GUIDE_1324,4,4,4,5,4,4,4,4,4,...,4,3,3,4.25,4.00,3.50,3.125,3.000000,3.461538,Master Park Guide Certification Program


In [3]:
# Distinct guide IDs and course names, each put in sorted order so that a
# label's index is its position in the sorted list.
users = list(df['Guide_ID'].unique())
users.sort()
courses = list(df['Recommended_Course'].unique())
courses.sort()

# Forward lookups (label -> position) ...
user_to_idx = dict(zip(users, range(len(users))))
course_to_idx = dict(zip(courses, range(len(courses))))
# ... and the matching reverse lookups (position -> label).
idx_to_user = dict(enumerate(users))
idx_to_course = dict(enumerate(courses))

print(f"Users: {len(users)}, Courses: {len(courses)}")
print("Courses:", courses)

Users: 1323, Courses: 7
Courses: ['Advanced Park Guiding: Leadership and Safety', 'Eco-Guide Training: Field & Interpretation Skills', 'Explore & Lead: Park Guide Mentorship Journey', 'Introduction to Park Guiding', 'Master Park Guide Certification Program', 'Nature Guide Fundamentals', 'Park Guide in Training: Learn from the Pros']


In [4]:
# Binary interaction matrix: one row per entry of `users`, one column per
# entry of `courses`. A cell is 1 when the survey data contains at least one
# row pairing that guide with that recommended course, and 0 otherwise.
user_item_matrix = np.zeros((len(users), len(courses)))

# Translate both label columns to matrix positions in one vectorized pass and
# set all observed (guide, course) cells at once. This replaces a Python-level
# `df.iterrows()` loop; repeated pairs simply write 1 to the same cell again,
# so the resulting matrix is the same.
interaction_rows = df['Guide_ID'].map(user_to_idx).to_numpy()
interaction_cols = df['Recommended_Course'].map(course_to_idx).to_numpy()
user_item_matrix[interaction_rows, interaction_cols] = 1

print(f"User-item matrix created: {user_item_matrix.shape}")
print(f"Matrix density: {(np.sum(user_item_matrix) / user_item_matrix.size * 100):.1f}%")

User-item matrix created: (1323, 7)
Matrix density: 64.1%


## 2. TRAIN MATRIX FACTORIZATION MODEL

In [5]:
print("\nTraining Matrix Factorization...")

# NMF with 5 latent components, a fixed seed, and at most 200 solver iterations.
mf_model = NMF(n_components=5, random_state=42, max_iter=200)

# The model is fitted on the interaction matrix with a constant 0.01 added to
# every entry.
# NOTE(review): presumably the offset keeps every entry strictly positive for
# the factorization; confirm the intent.
shifted_interactions = user_item_matrix + 0.01
user_factors = mf_model.fit_transform(shifted_interactions)

# Transpose the fitted components so each row of `item_factors` holds the
# latent vector of one course (users and courses then share the same layout).
item_factors = np.transpose(mf_model.components_)

print(f"MF Model trained - User factors: {user_factors.shape}, Item factors: {item_factors.shape}")


Training Matrix Factorization...
MF Model trained - User factors: (1323, 5), Item factors: (7, 5)


## 3. TRAIN K-NN MODEL

In [6]:
print("\nTraining k-NN Model...")

# Index every guide's row of the interaction matrix for cosine-distance
# neighbour lookups; queries return 10 neighbours by default.
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine').fit(user_item_matrix)

print("k-NN Model trained")


Training k-NN Model...
k-NN Model trained


## 4. CREATE HYBRID RECOMMENDER CLASS

In [7]:
class SimpleHybridRecommender:
    """Blend matrix-factorization and k-NN scores into course recommendations.

    The final score of a (user, course) pair is
    ``mf_weight * mf_score + knn_weight * knn_score`` where

    * ``mf_score`` is the dot product of the user's and the course's latent
      factor vectors, and
    * ``knn_score`` is the similarity-weighted average of the neighbours'
      entries for that course in ``user_item_matrix``.

    Users that are not in ``user_to_idx`` receive the globally most popular
    courses instead.
    """

    def __init__(self, mf_model, knn_model, user_factors, item_factors,
                 user_to_idx, course_to_idx, idx_to_user, idx_to_course, user_item_matrix):
        """Store the fitted models, factor matrices, lookups and interactions.

        Args:
            mf_model: Fitted factorization model (kept for reference only; the
                methods of this class do not call it).
            knn_model: Object exposing ``kneighbors(X)`` that returns
                ``(distances, indices)`` arrays with one row per query.
            user_factors: Array indexed by user position; one latent vector per user.
            item_factors: Array indexed by course position; one latent vector per course.
            user_to_idx: Mapping user id -> row position.
            course_to_idx: Mapping course id -> column position.
            idx_to_user: Mapping row position -> user id.
            idx_to_course: Mapping column position -> course id.
            user_item_matrix: 2-D NumPy array of interactions (users x courses).
        """
        self.mf_model = mf_model
        self.knn_model = knn_model
        self.user_factors = user_factors
        self.item_factors = item_factors
        self.user_to_idx = user_to_idx
        self.course_to_idx = course_to_idx
        self.idx_to_user = idx_to_user
        self.idx_to_course = idx_to_course
        self.user_item_matrix = user_item_matrix

        # Blend weights applied to the two component scores.
        self.mf_weight = 0.6
        self.knn_weight = 0.4

    def predict_mf_score(self, user_idx, course_idx):
        """Return the dot product of the user's and the course's latent factors."""
        return np.dot(self.user_factors[user_idx], self.item_factors[course_idx])

    def _knn_course_scores(self, user_idx):
        """Return the k-NN score of every course for ``user_idx`` as a 1-D array.

        Runs a single neighbour query for the user's own interaction row,
        drops the user from the result if it appears there, converts each
        distance ``d`` to a weight ``1 / (1 + d)`` and returns the weighted
        average of the neighbours' interaction rows. If no neighbour remains,
        all scores are 0.
        """
        distances, indices = self.knn_model.kneighbors([self.user_item_matrix[user_idx]])
        neighbor_idx = np.asarray(indices[0])
        similarities = 1.0 / (1.0 + np.asarray(distances[0], dtype=float))

        # The query row may be returned as its own neighbour; it must not vote.
        not_self = neighbor_idx != user_idx
        neighbor_idx = neighbor_idx[not_self]
        similarities = similarities[not_self]

        weight_sum = similarities.sum()
        if weight_sum <= 0:
            return np.zeros(self.user_item_matrix.shape[1])
        return similarities @ self.user_item_matrix[neighbor_idx] / weight_sum

    def predict_knn_score(self, user_idx, course_idx):
        """Return the similarity-weighted neighbour average for one course.

        Args:
            user_idx: Row position of the user in ``user_item_matrix``.
            course_idx: Column position of the course.

        Returns:
            float: Weighted average of the neighbours' entries for the course,
            or 0.0 when the user has no neighbour other than itself.
        """
        return float(self._knn_course_scores(user_idx)[course_idx])

    def get_recommendations(self, user_id, n_recommendations=5):
        """Return up to ``n_recommendations`` courses the user does not have yet.

        Args:
            user_id: User identifier. Unknown ids fall back to popular courses.
            n_recommendations: Maximum number of results. Zero or a negative
                number yields an empty list.

        Returns:
            list[dict]: Entries with ``course_id``, ``score``, ``mf_score`` and
            ``knn_score`` (plus ``reason`` for the popularity fallback), sorted
            by ``score`` in descending order.
        """
        # A negative count would otherwise slice "all but the last k" below,
        # while the popularity fallback already returns nothing for it.
        if n_recommendations is not None and n_recommendations <= 0:
            return []

        if user_id not in self.user_to_idx:
            return self._get_popular_courses(n_recommendations)

        user_idx = self.user_to_idx[user_id]
        user_courses = self.user_item_matrix[user_idx]

        # The neighbour query depends only on the user, so run it once here
        # instead of once per candidate course.
        knn_scores = self._knn_course_scores(user_idx)

        recommendations = []
        for course_idx, course_id in self.idx_to_course.items():
            # Only courses with no recorded interaction are candidates.
            if user_courses[course_idx] != 0:
                continue

            mf_score = self.predict_mf_score(user_idx, course_idx)
            knn_score = knn_scores[course_idx]
            hybrid_score = self.mf_weight * mf_score + self.knn_weight * knn_score

            recommendations.append({
                'course_id': course_id,
                'score': float(hybrid_score),
                'mf_score': float(mf_score),
                'knn_score': float(knn_score)
            })

        recommendations.sort(key=lambda x: x['score'], reverse=True)
        return recommendations[:n_recommendations]

    def _get_popular_courses(self, n_recommendations):
        """Fallback for new users - return the most popular courses.

        Popularity is the column sum of ``user_item_matrix``; the reported
        score is that sum divided by the number of known users.
        """
        course_popularity = np.sum(self.user_item_matrix, axis=0)
        popular_indices = np.argsort(course_popularity)[::-1]

        popular_courses = []
        for i in range(min(n_recommendations, len(popular_indices))):
            course_idx = popular_indices[i]
            course_id = self.idx_to_course[course_idx]
            popularity_score = course_popularity[course_idx] / len(self.user_to_idx)

            popular_courses.append({
                'course_id': course_id,
                'score': float(popularity_score),
                'mf_score': float(popularity_score),
                'knn_score': float(popularity_score),
                'reason': 'popular'
            })

        return popular_courses

## 5. CREATE AND TEST HYBRID MODEL

In [8]:
print("\nCreating Hybrid Recommender...")

# Bundle the fitted models, factor matrices, lookups and the interaction
# matrix into one recommender; keywords make the nine arguments unambiguous.
hybrid_model = SimpleHybridRecommender(
    mf_model=mf_model,
    knn_model=knn_model,
    user_factors=user_factors,
    item_factors=item_factors,
    user_to_idx=user_to_idx,
    course_to_idx=course_to_idx,
    idx_to_user=idx_to_user,
    idx_to_course=idx_to_course,
    user_item_matrix=user_item_matrix,
)

print("Hybrid model created")


Creating Hybrid Recommender...
Hybrid model created


In [9]:
# Smoke-test with the first guide in sorted order.
test_user = users[0]
print(f"\nTesting with user: {test_user}")

user_idx = user_to_idx[test_user]

# Courses already recorded for this guide: every column with a positive entry.
current_courses = [
    idx_to_course[course_idx]
    for course_idx, rating in enumerate(user_item_matrix[user_idx])
    if rating > 0
]

print("Current courses:", current_courses)


Testing with user: GUIDE_0002
Current courses: ['Advanced Park Guiding: Leadership and Safety', 'Eco-Guide Training: Field & Interpretation Skills', 'Explore & Lead: Park Guide Mentorship Journey', 'Master Park Guide Certification Program', 'Nature Guide Fundamentals']


In [10]:
# Ask for at most three courses; fewer come back when the guide has fewer
# than three courses without a recorded interaction.
recommendations = hybrid_model.get_recommendations(test_user, n_recommendations=3)

print("\nRecommendations:")
for rank, rec in enumerate(recommendations, start=1):
    course_name, score = rec['course_id'], rec['score']
    print(f"{rank}. {course_name} (Score: {score:.3f})")


Recommendations:
1. Park Guide in Training: Learn from the Pros (Score: 0.073)
2. Introduction to Park Guiding (Score: 0.016)


## 6. EVALUATE MODEL

In [11]:
print("\nEvaluating Model...")

# Sampling caps for this quick check: only the first EVAL_MAX_USERS rows are
# scanned and at most EVAL_MAX_INTERACTIONS pairs are scored.
EVAL_MAX_USERS = 50
EVAL_MAX_INTERACTIONS = 100

# NOTE(review): every sampled pair is an observed interaction taken from the
# same matrix both models were fitted on, so the target is always 1. The MAE
# below therefore measures how close in-sample scores of positives are to 1;
# it is not a held-out estimate and contains no negative examples.
sample_interactions = [
    (user_idx, course_idx, 1)
    for user_idx in range(min(EVAL_MAX_USERS, len(users)))
    for course_idx in range(len(courses))
    if user_item_matrix[user_idx, course_idx] > 0
]

predictions = []
actuals = []

for user_idx, course_idx, actual in sample_interactions[:EVAL_MAX_INTERACTIONS]:
    # Score through the recommender itself so the evaluation always uses the
    # same component scores and blend weights as get_recommendations().
    mf_pred = hybrid_model.predict_mf_score(user_idx, course_idx)
    knn_pred = hybrid_model.predict_knn_score(user_idx, course_idx)
    hybrid_pred = hybrid_model.mf_weight * mf_pred + hybrid_model.knn_weight * knn_pred

    predictions.append(hybrid_pred)
    actuals.append(actual)

mae = mean_absolute_error(actuals, predictions)
print(f"Model MAE: {mae:.4f}")


Evaluating Model...
Model MAE: 0.0526


## 7. EXPORT MODEL

In [12]:
print("\nExporting Model...")

# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError on the first open() below.
os.makedirs('models', exist_ok=True)

# NOTE(review): pickle stores a reference to the instance's class, not the
# class code. SimpleHybridRecommender is defined in this notebook, so any
# process that loads this file must make the class available under the same
# module path before calling pickle.load - confirm how the backend does this.
with open('models/hybrid_recommender.pkl', 'wb') as f:
    pickle.dump(hybrid_model, f)

with open('models/user_mappings.pkl', 'wb') as f:
    pickle.dump({'user_to_idx': user_to_idx, 'idx_to_user': idx_to_user}, f)

with open('models/course_mappings.pkl', 'wb') as f:
    pickle.dump({'course_to_idx': course_to_idx, 'idx_to_course': idx_to_course}, f)

# Hyper-parameters are read back from the fitted objects instead of being
# retyped, so the metadata cannot drift from what was actually trained.
metadata = {
    'model_type': 'hybrid_mf_knn',
    'num_users': len(users),
    'num_courses': len(courses),
    'courses': courses,
    'mf_components': int(mf_model.n_components),
    'knn_neighbors': int(knn_model.n_neighbors),
    'mf_weight': float(hybrid_model.mf_weight),
    'knn_weight': float(hybrid_model.knn_weight),
    # Plain float keeps the value JSON-serializable whatever numeric type
    # the metric function returned.
    'mae': float(mae),
    'matrix_density': float(np.sum(user_item_matrix) / user_item_matrix.size),
    'created_at': pd.Timestamp.now().isoformat()
}

with open('models/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("Model exported successfully!")
print("Files created:")
print("  - models/hybrid_recommender.pkl")
print("  - models/user_mappings.pkl")
print("  - models/course_mappings.pkl")
print("  - models/model_metadata.json")

print(f"\n Training Complete! MAE: {mae:.4f}")
print("Ready to integrate with FastAPI backend!")


Exporting Model...
Model exported successfully!
Files created:
  - models/hybrid_recommender.pkl
  - models/user_mappings.pkl
  - models/course_mappings.pkl
  - models/model_metadata.json

 Training Complete! MAE: 0.0526
Ready to integrate with FastAPI backend!
