In [1]:
# Model Training Notebook for Tech Book Recommender
# Save this as: notebooks/model_training.ipynb

# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
import pickle
import os
import sys

# Add parent directory to path
sys.path.append('..')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [3]:
# Cell 2: Load Data
books_data = {
    'book_id': range(1, 26),
    'title': [
        "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow",
        "Deep Learning", "Python Machine Learning", 
        "Reinforcement Learning: An Introduction",
        "Pattern Recognition and Machine Learning",
        "Deep Learning with Python", "The Hundred-Page Machine Learning Book",
        "Python for Data Analysis", "Introduction to Statistical Learning",
        "Natural Language Processing with Python",
        "Computer Vision: Algorithms and Applications",
        "Designing Machine Learning Systems", "Grokking Deep Learning",
        "Data Science from Scratch", "Algorithms",
        "Deep Reinforcement Learning Hands-On", "Fluent Python",
        "Speech and Language Processing", "Machine Learning Engineering",
        "Probabilistic Machine Learning: An Introduction",
        "Deep Learning for Computer Vision", "Python Data Science Handbook",
        "Introduction to Algorithms", "Effective Python",
        "Neural Networks and Deep Learning"
    ],
    'author': [
        "Aurélien Géron", "Ian Goodfellow, Yoshua Bengio, Aaron Courville",
        "Sebastian Raschka", "Richard S. Sutton, Andrew G. Barto",
        "Christopher M. Bishop", "François Chollet", "Andriy Burkov",
        "Wes McKinney", "Gareth James, Daniela Witten", "Steven Bird, Ewan Klein",
        "Richard Szeliski", "Chip Huyen", "Andrew Trask", "Joel Grus",
        "Robert Sedgewick, Kevin Wayne", "Maxim Lapan", "Luciano Ramalho",
        "Dan Jurafsky, James H. Martin", "Andriy Burkov", "Kevin Murphy",
        "Rajalingappaa Shanmugamani", "Jake VanderPlas", "Thomas H. Cormen",
        "Brett Slatkin", "Michael Nielsen"
    ],
    'category': [
        "Machine Learning", "Deep Learning", "Machine Learning", 
        "Reinforcement Learning", "Machine Learning", "Deep Learning",
        "Machine Learning", "Data Science", "Machine Learning", "NLP",
        "Computer Vision", "MLOps", "Deep Learning", "Data Science", "Algorithms",
        "Reinforcement Learning", "Python", "NLP", "MLOps", "Machine Learning",
        "Computer Vision", "Data Science", "Algorithms", "Python", "Deep Learning"
    ],
    'level': [
        "Intermediate", "Advanced", "Intermediate", "Advanced", "Advanced",
        "Beginner", "Beginner", "Beginner", "Intermediate", "Intermediate",
        "Advanced", "Intermediate", "Beginner", "Beginner", "Intermediate",
        "Intermediate", "Intermediate", "Advanced", "Intermediate", "Advanced",
        "Intermediate", "Intermediate", "Advanced", "Intermediate", "Beginner"
    ],
    'rating': [4.6, 4.5, 4.4, 4.7, 4.6, 4.5, 4.3, 4.4, 4.6, 4.2, 4.5, 4.7, 
               4.4, 4.3, 4.5, 4.4, 4.7, 4.6, 4.5, 4.6, 4.3, 4.5, 4.5, 4.5, 4.7],
    'year': [2022, 2016, 2019, 2018, 2006, 2021, 2019, 2022, 2021, 2009, 
             2022, 2022, 2019, 2019, 2011, 2020, 2022, 2023, 2020, 2022, 
             2018, 2016, 2009, 2019, 2015]
}

df = pd.DataFrame(books_data)
print(f"✓ Loaded {len(df)} books")
print(df.head())

✓ Loaded 25 books
   book_id                                              title  \
0        1  Hands-On Machine Learning with Scikit-Learn, K...   
1        2                                      Deep Learning   
2        3                            Python Machine Learning   
3        4            Reinforcement Learning: An Introduction   
4        5           Pattern Recognition and Machine Learning   

                                           author                category  \
0                                  Aurélien Géron        Machine Learning   
1  Ian Goodfellow, Yoshua Bengio, Aaron Courville           Deep Learning   
2                               Sebastian Raschka        Machine Learning   
3              Richard S. Sutton, Andrew G. Barto  Reinforcement Learning   
4                           Christopher M. Bishop        Machine Learning   

          level  rating  year  
0  Intermediate     4.6  2022  
1      Advanced     4.5  2016  
2  Intermediate     4.4  2019  


In [4]:
# Cell 3: Feature Engineering
print("\n" + "="*60)
print("FEATURE ENGINEERING")
print("="*60)

# Create content features
df['content'] = df['title'] + ' ' + df['category'] + ' ' + df['level'] + ' ' + df['author']

# Encode categorical variables
le_category = LabelEncoder()
le_level = LabelEncoder()

df['category_encoded'] = le_category.fit_transform(df['category'])
df['level_encoded'] = le_level.fit_transform(df['level'])

# Normalize year
df['year_normalized'] = (df['year'] - df['year'].min()) / (df['year'].max() - df['year'].min())

print("✓ Features engineered")
print(df[['book_id', 'title', 'category_encoded', 'level_encoded', 'year_normalized']].head())


FEATURE ENGINEERING
✓ Features engineered
   book_id                                              title  \
0        1  Hands-On Machine Learning with Scikit-Learn, K...   
1        2                                      Deep Learning   
2        3                            Python Machine Learning   
3        4            Reinforcement Learning: An Introduction   
4        5           Pattern Recognition and Machine Learning   

   category_encoded  level_encoded  year_normalized  
0                 5              2         0.941176  
1                 3              0         0.588235  
2                 5              2         0.764706  
3                 8              0         0.705882  
4                 5              0         0.000000  


In [5]:
# Cell 4: Train Content-Based Model
print("\n" + "="*60)
print("TRAINING CONTENT-BASED MODEL")
print("="*60)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=100)
tfidf_matrix = tfidf.fit_transform(df['content'])

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"Cosine Similarity Matrix Shape: {cosine_sim.shape}")

# Test the model
def get_content_recommendations(book_id, n=5):
    idx = df[df['book_id'] == book_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # Exclude the book itself
    book_indices = [i[0] for i in sim_scores]
    return df.iloc[book_indices][['book_id', 'title', 'category', 'rating']]

# Test
test_book_id = 1
print(f"\nRecommendations for book ID {test_book_id} ({df[df['book_id']==test_book_id]['title'].values[0]}):")
print(get_content_recommendations(test_book_id, n=3))

print("\n✓ Content-based model trained")


TRAINING CONTENT-BASED MODEL
TF-IDF Matrix Shape: (25, 100)
Number of features: 100
Cosine Similarity Matrix Shape: (25, 25)

Recommendations for book ID 1 (Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow):
    book_id                                            title  \
2         3                          Python Machine Learning   
19       20  Probabilistic Machine Learning: An Introduction   
6         7           The Hundred-Page Machine Learning Book   

            category  rating  
2   Machine Learning     4.4  
19  Machine Learning     4.6  
6   Machine Learning     4.3  

✓ Content-based model trained


In [6]:
# Cell 5: Train KNN Model
print("\n" + "="*60)
print("TRAINING KNN MODEL")
print("="*60)

# Create feature matrix
feature_matrix = df[['category_encoded', 'level_encoded', 'rating', 'year_normalized']].values

# Train KNN
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(feature_matrix)

print(f"Feature Matrix Shape: {feature_matrix.shape}")

# Test KNN
def get_knn_recommendations(book_id, n=5):
    idx = df[df['book_id'] == book_id].index[0]
    book_features = feature_matrix[idx].reshape(1, -1)
    distances, indices = knn.kneighbors(book_features, n_neighbors=n+1)
    similar_indices = indices[0][1:]  # Exclude the book itself
    return df.iloc[similar_indices][['book_id', 'title', 'category', 'rating']]

# Test
print(f"\nKNN Recommendations for book ID {test_book_id}:")
print(get_knn_recommendations(test_book_id, n=3))

print("\n✓ KNN model trained")


TRAINING KNN MODEL
Feature Matrix Shape: (25, 4)

KNN Recommendations for book ID 1:
    book_id                                 title          category  rating
8         9  Introduction to Statistical Learning  Machine Learning     4.6
2         3               Python Machine Learning  Machine Learning     4.4
18       19          Machine Learning Engineering             MLOps     4.5

✓ KNN model trained


In [7]:
# Cell 6: Hybrid Recommendation Function
print("\n" + "="*60)
print("CREATING HYBRID RECOMMENDER")
print("="*60)

def hybrid_recommend(user_ratings, n=6):
    """
    Hybrid recommendation combining content-based and KNN
    user_ratings: dict {book_id: rating}
    """
    # Get highly rated books
    liked_books = [book_id for book_id, rating in user_ratings.items() if rating >= 4]
    
    if not liked_books:
        # Return popular books if no high ratings
        return df.nlargest(n, 'rating')[['book_id', 'title', 'category', 'rating']]
    
    # Get content-based recommendations
    content_recs = []
    for book_id in liked_books:
        content_recs.extend(get_content_recommendations(book_id, n=5)['book_id'].tolist())
    
    # Get KNN recommendations
    knn_recs = []
    for book_id in liked_books[:3]:  # Use top 3 liked books
        knn_recs.extend(get_knn_recommendations(book_id, n=5)['book_id'].tolist())
    
    # Combine and score
    all_recs = {}
    for book_id in content_recs:
        all_recs[book_id] = all_recs.get(book_id, 0) + 0.5  # Content weight
    
    for book_id in knn_recs:
        all_recs[book_id] = all_recs.get(book_id, 0) + 0.3  # KNN weight
    
    # Add popularity score
    for book_id in all_recs.keys():
        book = df[df['book_id'] == book_id]
        if not book.empty:
            all_recs[book_id] += book.iloc[0]['rating'] * 0.2
    
    # Remove already rated books
    for book_id in user_ratings.keys():
        all_recs.pop(book_id, None)
    
    # Sort and return top N
    sorted_recs = sorted(all_recs.items(), key=lambda x: x[1], reverse=True)[:n]
    rec_ids = [book_id for book_id, _ in sorted_recs]
    
    return df[df['book_id'].isin(rec_ids)][['book_id', 'title', 'category', 'rating']]

# Test hybrid model
test_ratings = {1: 5, 3: 4, 6: 5}
print(f"\nHybrid Recommendations for user ratings: {test_ratings}")
print(hybrid_recommend(test_ratings, n=5))

print("\n✓ Hybrid recommender created")


CREATING HYBRID RECOMMENDER

Hybrid Recommendations for user ratings: {1: 5, 3: 4, 6: 5}
    book_id                                            title  \
4         5         Pattern Recognition and Machine Learning   
6         7           The Hundred-Page Machine Learning Book   
8         9             Introduction to Statistical Learning   
15       16             Deep Reinforcement Learning Hands-On   
19       20  Probabilistic Machine Learning: An Introduction   

                  category  rating  
4         Machine Learning     4.6  
6         Machine Learning     4.3  
8         Machine Learning     4.6  
15  Reinforcement Learning     4.4  
19        Machine Learning     4.6  

✓ Hybrid recommender created


In [8]:
# Cell 7: Model Evaluation
print("\n" + "="*60)
print("MODEL EVALUATION")
print("="*60)

# Simulate user ratings for testing
np.random.seed(42)
test_users = 5
evaluation_results = []

for user_id in range(test_users):
    # Random user ratings
    num_ratings = np.random.randint(3, 8)
    rated_books = np.random.choice(df['book_id'].values, size=num_ratings, replace=False)
    user_ratings = {book_id: np.random.randint(3, 6) for book_id in rated_books}
    
    # Get recommendations
    recommendations = hybrid_recommend(user_ratings, n=5)
    
    # Calculate average rating of recommendations
    avg_rec_rating = recommendations['rating'].mean()
    
    evaluation_results.append({
        'user_id': user_id,
        'num_ratings': num_ratings,
        'avg_user_rating': np.mean(list(user_ratings.values())),
        'avg_rec_rating': avg_rec_rating
    })

eval_df = pd.DataFrame(evaluation_results)
print(eval_df)
print(f"\nAverage Recommendation Quality: {eval_df['avg_rec_rating'].mean():.2f}")

print("\n✓ Model evaluation complete")


MODEL EVALUATION
   user_id  num_ratings  avg_user_rating  avg_rec_rating
0        0            6         4.000000            4.44
1        1            4         4.500000            4.44
2        2            3         3.000000            4.68
3        3            7         4.428571            4.54
4        4            5         4.200000            4.54

Average Recommendation Quality: 4.53

✓ Model evaluation complete


In [9]:
# Cell 8: Save Models
print("\n" + "="*60)
print("SAVING MODELS")
print("="*60)

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save content-based model
content_model = {
    'tfidf': tfidf,
    'tfidf_matrix': tfidf_matrix,
    'cosine_sim': cosine_sim,
    'books_df': df
}

with open('../models/content_based_model.pkl', 'wb') as f:
    pickle.dump(content_model, f)
print("✓ Content-based model saved")

# Save KNN model
knn_model = {
    'knn': knn,
    'feature_matrix': feature_matrix,
    'books_df': df,
    'le_category': le_category,
    'le_level': le_level
}

with open('../models/knn_model.pkl', 'wb') as f:
    pickle.dump(knn_model, f)
print("✓ KNN model saved")

# Save encoders
with open('../models/label_encoders.pkl', 'wb') as f:
    pickle.dump({
        'category': le_category,
        'level': le_level
    }, f)
print("✓ Label encoders saved")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)
print("\nSaved files:")
print("  - models/content_based_model.pkl")
print("  - models/knn_model.pkl")
print("  - models/label_encoders.pkl")


SAVING MODELS
✓ Content-based model saved
✓ KNN model saved
✓ Label encoders saved

MODEL TRAINING COMPLETE!

Saved files:
  - models/content_based_model.pkl
  - models/knn_model.pkl
  - models/label_encoders.pkl
