# 📦 Cell 1: Install and Import Packages

In [1]:
!pip install scikit-surprise --quiet

In [5]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
import numpy as np


# 📥 Cell 2: Load Preprocessed Feedback

In [3]:
# --- Step 1: Load Preprocessed Feedback ---

df = pd.read_csv("/kaggle/input/recommender-system-data/preprocessed_feedback.csv")
print(f"✅ Loaded data - shape: {df.shape}")
df[['learner_id', 'trainer_id', 'rating', 'vader_sentiment']].head()

✅ Loaded data - shape: (38444, 17)


Unnamed: 0,learner_id,trainer_id,rating,vader_sentiment
0,1,UK,100.0,Positive
1,2,UK,100.0,Positive
2,3,USA,100.0,Positive
3,4,UK,100.0,Positive
4,5,UK,82.0,Positive


# 🧠 Cell 3: Prepare Dataset for Surprise

In [7]:
# Normalize sentiment scores to range [0, 1]
df['sentiment_norm'] = (df['vader_score'] + 1) / 2  # original range [-1, 1] -> [0, 1]

# --- Step 2: Grid Search to Tune SVD ---

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['learner_id', 'trainer_id', 'rating']], reader)

param_grid = {
    'n_epochs': [20, 40, 60],
    'lr_all': [0.1,0.25,0.5,0.75,0.01, 0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

print("🔍 Performing grid search...")
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

print(f"✅ Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"🏆 Best Params: {gs.best_params['rmse']}")


🔍 Performing grid search...
✅ Best RMSE: 82.0356
🏆 Best Params: {'n_epochs': 20, 'lr_all': 0.1, 'reg_all': 0.02}


# 🤖 Cell 4: Train SVD Recommender

In [8]:
# --- Step 3: Train Final Model ---

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
algo = gs.best_estimator['rmse']
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
print(f"📉 Final RMSE on test set: {rmse:.4f}")

RMSE: 82.1468
📉 Final RMSE on test set: 82.1468


# 🧪 Cell 5: Generate Top-N Recommendations for a Learner

In [9]:
 #--- Step 4: Top-N Recommender with Hybrid Scoring ---

def get_top_n_hybrid(algo, learner_id, all_trainers, rated_trainers, sentiment_dict, n=5, alpha=0.7):
    unseen = [t for t in all_trainers if t not in rated_trainers]
    scored = []
    for trainer in unseen:
        pred_rating = algo.predict(learner_id, trainer).est
        sentiment_score = sentiment_dict.get(trainer, 0.5)  # default neutral if missing
        final_score = alpha * pred_rating + (1 - alpha) * sentiment_score * 5  # scale sentiment to match rating scale
        scored.append((trainer, final_score))
    return sorted(scored, key=lambda x: x[1], reverse=True)[:n]


# 📊 Cell 6: Try Recommendation Example

In [12]:
# --- Step 5: Example Top-N Recommendation ---

learner_id = df['learner_id'].iloc[0]
rated_trainers = df[df['learner_id'] == learner_id]['trainer_id'].tolist()
all_trainers = df['trainer_id'].unique().tolist()
sentiment_dict = df.groupby('trainer_id')['sentiment_norm'].mean().to_dict()

recommendations = get_top_n_hybrid(algo, learner_id, all_trainers, rated_trainers, sentiment_dict)

print(f"\nTop 5 trainer recommendations for Learner {learner_id}:")
for trainer, score in recommendations:
    print(f"Trainer: {trainer}, Hybrid Score: {score:.2f}")



Top 5 trainer recommendations for Learner 1:
Trainer: USA, Hybrid Score: 4.58
Trainer: France, Hybrid Score: 4.58
Trainer: Germany, Hybrid Score: 4.58
Trainer: Canada, Hybrid Score: 4.58


# 📎 Cell 7: Save Model Output

In [14]:

top_recs_df = pd.DataFrame(recommendations, columns=["trainer_id", "predicted_rating"])
top_recs_df.to_csv(f"/kaggle/working/recommendations_{learner_id}.csv", index=False)
print(f"📁 Saved recommendations for {learner_id}")


📁 Saved recommendations for 1
