<a href="https://colab.research.google.com/github/javeriaz15/Video-Recommendation/blob/main/VRS_R1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install both 'surprise' and 'lightfm' libraries in Google Colab
!pip install scikit-surprise lightfm


import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
import json
from datetime import datetime #??where

# Configure logging for errors only
logging.basicConfig(filename="recommendation_system.log", level=logging.ERROR)

# Define weights for SVD and LightFM recommendations
SVD_WEIGHT = 0.6
LIGHTFM_WEIGHT = 0.4

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise, lightfm
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?

In [3]:
# Define URL parameters for flexibility
USER_DATA_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
VIDEO_CATALOG_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/Video_catalog.json"

# Load data with error handling and retry mechanism
def load_data(url):
    try:
        logging.info(f"Loading data from {url}.")
        return pd.read_json(url)
    except Exception as e:
        logging.error(f"Error loading data from {url}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if loading fails

user_data = load_data(USER_DATA_URL)
video_catalog = load_data(VIDEO_CATALOG_URL)
user_data.head(), video_catalog.head()  # Check the first few rows to verify loading


(   user_id country         city state  age  \
 0        1     USA  Los Angeles    CA   30   
 1        2     USA     New York    NY   30   
 2        3     USA  Los Angeles    CA   18   
 3        4  Canada      Toronto    ON   40   
 4        5  Canada    Vancouver    BC   18   
 
                                     video_link      genre  watched  liked  \
 0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
 1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
 2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
 3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
 4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   
 
    skipped           timestamp  
 0     True 2024-10-30 08:00:00  
 1     True 2024-10-30 08:02:00  
 2     True 2024-10-30 08:04:00  
 3     True 2024-10-30 08:06:00  
 4     True 2024-10-30 08:08:00  ,
    video_id                             

In [3]:
'''
# Set up the collaborative filtering model with Surprise (SVD-based)
reader = Reader(rating_scale=(0, 1))
surprise_data = Dataset.load_from_df(user_data[['user_id', 'video_link', 'liked']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2)

# Train the SVD model
svd_model = SVD()
svd_model.fit(trainset)

# Predict on the test set and calculate RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
logging.info(f"SVD Model RMSE: {rmse}")
print(f"SVD Model RMSE: {rmse}")

# rating are binary: 0 for "not liked" and 1 for "liked".
# an RMSE closer to 0 (like 0.4764 here) still indicates that the model is effectively predicting ratings in line with actual user preferences
# can be small but not; dataset is small or lacks diversity in user interactions
'''

RMSE: 0.4764
SVD Model RMSE: 0.47639078699546056


In [4]:
# option 2 of above
# Set up the collaborative filtering model with Surprise (SVD-based)
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

reader = Reader(rating_scale=(0, 1))
surprise_data = Dataset.load_from_df(user_data[['user_id', 'video_link', 'liked']], reader)

# Define the parameter grid for GridSearch
param_grid = {
    'n_factors': [100, 200, 250, 300],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.002, 0.003, 0.005],
    'reg_all': [0.15, 0.18, 0.2]  # Increased regularization values
}

# Initialize GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(surprise_data)

# Retrieve the best parameters and RMSE score from the grid search
best_params = grid_search.best_params['rmse']
best_rmse = grid_search.best_score['rmse']
logging.info(f"Best RMSE Score from Grid Search: {best_rmse}")
logging.info(f"Best Hyperparameters: {best_params}")
print(f"Best RMSE Score from Grid Search: {best_rmse}")
print(f"Best Hyperparameters: {best_params}")

# Train the optimized SVD model using cross-validation
logging.info("Evaluating the optimized model with cross-validation.")
final_model = SVD(**best_params)
cross_val_results = cross_validate(final_model, surprise_data, measures=['rmse'], cv=10, verbose=True)
average_rmse = cross_val_results['test_rmse'].mean()
print(f"Average RMSE with Cross-Validation: {average_rmse}")
logging.info(f"Average RMSE with Cross-Validation: {average_rmse}")

# Optional: If final evaluation on a test set is desired
trainset, testset = train_test_split(surprise_data, test_size=0.2)
final_model.fit(trainset)
predictions = final_model.test(testset)
test_rmse = accuracy.rmse(predictions)
logging.info(f"Optimized SVD Model Test RMSE: {test_rmse}")
print(f"Optimized SVD Model Test RMSE: {test_rmse}")

Best RMSE Score from Grid Search: 0.46871320542068573
Best Hyperparameters: {'n_factors': 300, 'n_epochs': 30, 'lr_all': 0.002, 'reg_all': 0.18}
Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.3375  0.6798  0.5940  0.5380  0.5668  0.4296  0.4181  0.4666  0.6272  0.4879  0.5146  0.1002  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Average RMSE with Cross-Validation: 0.514559092519934
RMSE: 0.5452
Optimized SVD Model Test RMSE: 0.5452492420656216


In [5]:
#LIGHTFM OPTION 1
# Ensure unique video_link values in video_catalog before setting index

'''
video_catalog = video_catalog.drop_duplicates(subset=['video_link'])

# Encode video features for LightFM model
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
user_data['user_id'] = user_data['user_id'].astype(str)
user_data['video_id'] = user_data['video_link'].map(video_catalog.set_index('video_link')['video_id'])

# Initialize LightFM dataset with all item feature types
lfm_dataset = LightFMDataset()
lfm_dataset.fit(
    users=(x for x in user_data['user_id'].unique()),
    items=(x for x in video_catalog['video_id'].unique()),
    item_features=(x for x in pd.concat([video_catalog['genre'], video_catalog['age_group'],
                                         video_catalog['city'], video_catalog['country']]).unique())
)

# Define feature extraction function and generate item features
def extract_item_features(df, feature_cols):
    return list(zip(df['video_id'], df[feature_cols].apply(lambda x: list(map(str, x)), axis=1)))

# Define the feature columns based on the actual data structure in video_catalog
feature_columns = ['genre', 'age_group', 'city', 'country']
item_features = lfm_dataset.build_item_features(extract_item_features(video_catalog, feature_columns))

# Define the range of parameters to test for LightFM - BASELINE
loss_functions = ['warp', 'bpr', 'warp-kos']
epochs = [15, 30] # adjust epochs later
learning_rates = [0.01, 0.05]
embedding_sizes = [30, 50]  # no_components # adjust epochs later

# Initialize variables to store best results
best_precision = 0
best_params = {}

# Loop through each combination of parameters
for loss in loss_functions:
    for epoch in epochs:
        for lr in learning_rates:
            for n_components in embedding_sizes:
                # Train LightFM model with the current set of parameters
                lfm_model = LightFM(loss=loss, learning_rate=lr, no_components=n_components)
                (interactions, weights) = lfm_dataset.build_interactions(
                    ((str(row['user_id']), row['video_id']) for _, row in user_data.iterrows())
                )
                lfm_model.fit(interactions, item_features=item_features, epochs=epoch, num_threads=4)

                # Calculate Precision@5 for LightFM
                precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
                logging.info(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, Precision: {precision}")
                print(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, Precision: {precision}")

                # Update best precision and parameters if current precision is higher
                if precision > best_precision:
                    best_precision = precision
                    best_params = {'loss': loss, 'epochs': epoch, 'learning_rate': lr, 'no_components': n_components}

# Display the best parameters and corresponding precision score
print(f"\nBest Precision: {best_precision} with parameters: {best_params}")
logging.info(f"Best Precision: {best_precision} with parameters: {best_params}")
'''

'''
Result: Best Precision: 0.8285714387893677 with parameters: {'loss': 'warp-kos', 'epochs': 30, 'learning_rate': 0.05, 'no_components': 30}
0.83 means that in the top 5 recommendations, about 83% of the items are relevant
'''

Loss Function: warp, Epochs: 15, Learning Rate: 0.01, Components: 30, Precision: 0.34285715222358704
Loss Function: warp, Epochs: 15, Learning Rate: 0.01, Components: 50, Precision: 0.34285715222358704
Loss Function: warp, Epochs: 15, Learning Rate: 0.05, Components: 30, Precision: 0.5428571701049805
Loss Function: warp, Epochs: 15, Learning Rate: 0.05, Components: 50, Precision: 0.5428571701049805
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, Precision: 0.37142857909202576
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 50, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.05, Components: 30, Precision: 0.6857143044471741
Loss Function: warp, Epochs: 30, Learning Rate: 0.05, Components: 50, Precision: 0.6285714507102966
Loss Function: bpr, Epochs: 15, Learning Rate: 0.01, Components: 30, Precision: 0.2857142984867096
Loss Function: bpr, Epochs: 15, Learning Rate: 0.01, Components: 50, Precision: 0.285714328289032


"\nResult: Best Precision: 0.8285714387893677 with parameters: {'loss': 'warp-kos', 'epochs': 30, 'learning_rate': 0.05, 'no_components': 30}\n0.83 means that in the top 5 recommendations, about 83% of the items are relevant\n"

In [13]:
'''
# LIGHTTFM OPTION 2
# Higher Epochs and More Parameter Tuning
# iterates through epochs of 30, 50, and 100, Higher no_components Values, Expanded learning_rate, Added Regularization (user_alpha and item_alpha) to prevent overfitting

# Define the extended range of parameters for LightFM tuning
loss_functions = ['warp', 'warp-kos']  # warp-gamma can be tried if on GPU
epochs = [30, 50, 100]  # increased epochs for deeper training
learning_rates = [0.01, 0.05, 0.1]  # higher learning rate options
embedding_sizes = [30, 50, 100]  # larger embedding dimensionality

# Additional parameters for regularization
user_alphas = [1e-6, 1e-5, 1e-4]  # user regularization strengths
item_alphas = [1e-6, 1e-5, 1e-4]  # item regularization strengths

# Initialize variables to store best results
best_precision = 0
best_params = {}

# Loop through each combination of parameters
for loss in loss_functions:
    for epoch in epochs:
        for lr in learning_rates:
            for n_components in embedding_sizes:
                for user_alpha in user_alphas:
                    for item_alpha in item_alphas:
                        # Train LightFM model with the current set of parameters
                        lfm_model = LightFM(loss=loss, learning_rate=lr, no_components=n_components,
                                            user_alpha=user_alpha, item_alpha=item_alpha)
                        (interactions, weights) = lfm_dataset.build_interactions(
                            ((str(row['user_id']), row['video_id']) for _, row in user_data.iterrows())
                        )
                        lfm_model.fit(interactions, item_features=item_features, epochs=epoch, num_threads=4)

                        # Calculate Precision@5 for LightFM
                        precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
                        logging.info(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, "
                                     f"User Alpha: {user_alpha}, Item Alpha: {item_alpha}, Precision: {precision}")
                        print(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, "
                              f"User Alpha: {user_alpha}, Item Alpha: {item_alpha}, Precision: {precision}")

                        # Update best precision and parameters if current precision is higher
                        if precision > best_precision:
                            best_precision = precision
                            best_params = {
                                'loss': loss, 'epochs': epoch, 'learning_rate': lr, 'no_components': n_components,
                                'user_alpha': user_alpha, 'item_alpha': item_alpha
                            }

# Display the best parameters and corresponding precision score
print(f"\nBest Precision: {best_precision} with parameters: {best_params}")
logging.info(f"Best Precision: {best_precision} with parameters: {best_params}")
'''

Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 1e-06, Precision: 0.34285715222358704
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 1e-05, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 0.0001, Precision: 0.4000000059604645
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-05, Item Alpha: 1e-06, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-05, Item Alpha: 1e-05, Precision: 0.37142863869667053
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-05, Item Alpha: 0.0001, Precision: 0.34285715222358704
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 0.0001, Item Alpha: 1e-06, Precision: 0.4571428596973419
Loss Function: warp, Epochs:

In [16]:
# LIGHTFM OPTION 3 - Higher Epochs and More Parameter Tuning
# Expanded embedding sizes to 50 and 100, Iterated with 50 and 100 epochs, Adjusted user_alpha and item_alpha with finer increments (e.g., 2e-6 and 5e-6), Expanded learning rate options up to 0.1

# Define the extended range of parameters for LightFM tuning
loss_functions = ['warp', 'warp-kos']  # Additional loss functions for ranking
epochs = [30, 50, 100]  # Increased epochs for deeper training
learning_rates = [0.01, 0.05, 0.1]  # Higher learning rates to explore faster convergence
embedding_sizes = [30, 50, 100]  # Larger dimensionality for embedding, enabling richer feature capture

# Additional parameters for regularization with finer increments
user_alphas = [1e-6, 2e-6, 5e-6, 1e-5]  # Small increments in user regularization to prevent overfitting
item_alphas = [1e-6, 2e-6, 5e-6, 1e-5]  # Small increments in item regularization

# Initialize variables to store the best results
best_precision = 0
best_params = {}

# Loop through each combination of parameters
for loss in loss_functions:
    for epoch in epochs:
        for lr in learning_rates:
            for n_components in embedding_sizes:
                for user_alpha in user_alphas:
                    for item_alpha in item_alphas:
                        # Train LightFM model with the current set of parameters
                        lfm_model = LightFM(
                            loss=loss, learning_rate=lr, no_components=n_components,
                            user_alpha=user_alpha, item_alpha=item_alpha
                        )
                        (interactions, weights) = lfm_dataset.build_interactions(
                            ((str(row['user_id']), row['video_id']) for _, row in user_data.iterrows())
                        )

                        # Additional features may be incorporated here if available, enhancing personalization
                        lfm_model.fit(interactions, item_features=item_features, epochs=epoch, num_threads=4)

                        # Calculate Precision@5 for LightFM
                        precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
                        logging.info(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, "
                                     f"User Alpha: {user_alpha}, Item Alpha: {item_alpha}, Precision: {precision}")
                        print(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, "
                              f"User Alpha: {user_alpha}, Item Alpha: {item_alpha}, Precision: {precision}")

                        # Update best precision and parameters if current precision is higher
                        if precision > best_precision:
                            best_precision = precision
                            best_params = {
                                'loss': loss, 'epochs': epoch, 'learning_rate': lr, 'no_components': n_components,
                                'user_alpha': user_alpha, 'item_alpha': item_alpha
                            }

# Display the best parameters and corresponding precision score
print(f"\nBest Precision: {best_precision} with parameters: {best_params}")
logging.info(f"Best Precision: {best_precision} with parameters: {best_params}")

'''
Result: Best Precision: 0.8857143521308899 with parameters: {'loss': 'warp', 'epochs': 30, 'learning_rate': 0.1, 'no_components': 30, 'user_alpha': 1e-06, 'item_alpha': 1e-06}
0.88 means that in the top 5 recommendations, about 88% of the items are relevant
'''

'''
To do: Incremental Model Updates:
As new data comes in, consider training the model incrementally with recent interactions to keep it updated without fully retraining.
'''

Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 1e-06, Precision: 0.3428571820259094
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 2e-06, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 5e-06, Precision: 0.40000003576278687
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 1e-06, Item Alpha: 1e-05, Precision: 0.34285715222358704
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 2e-06, Item Alpha: 1e-06, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 2e-06, Item Alpha: 2e-06, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, User Alpha: 2e-06, Item Alpha: 5e-06, Precision: 0.40000003576278687
Loss Function: warp, Epochs: 30

In [17]:
# Time decay function to adjust ratings based on recency of interaction
def time_decay(timestamp, decay_rate=0.001):
    # Convert timestamp to string format if it's not already a string
    timestamp_str = timestamp.strftime("%Y-%m-%dT%H:%M:%S") if isinstance(timestamp, pd.Timestamp) else timestamp
    days_ago = (datetime.now() - datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S")).days
    return np.exp(-decay_rate * days_ago)

# Apply time decay to liked ratings
user_data['decayed_liked'] = user_data.apply(
    lambda row: row['liked'] * time_decay(row['timestamp']) if row['liked'] else 0, axis=1
)

# Calculate engagement thresholds based on data
MIN_INTERACTIONS_FOR_ACTIVE = int(user_data.groupby('user_id').size().quantile(0.5))
HIGH_WATCH_THRESHOLD = user_data['watched'].quantile(0.75)
LOW_WATCH_THRESHOLD = user_data['watched'].quantile(0.25)

print("Engagement thresholds set:", MIN_INTERACTIONS_FOR_ACTIVE, HIGH_WATCH_THRESHOLD, LOW_WATCH_THRESHOLD)

'''
Engagement thresholds set (calculated based on the data in user_data): 6 0.8 0.1
MIN_INTERACTIONS_FOR_ACTIVE 6 = the median (50th percentile) of the number of interactions per user, meaning that 50% of the users have 6 or fewer interactions
HIGH_WATCH_THRESHOLD 0.8 = 75th percentile of the watched field - a user who has a watched rate above this value (80% or more) is considered highly engaged
LOW_WATCH_THRESHOLD 0.1 = 25th percentile of the watched field, indicating low engagement- uers with a watched rate of 10% or lower are considered to have low engagement.
'''

Engagement thresholds set: 6 0.8 0.1


'\nEngagement thresholds set (calculated based on the data in user_data): 6 0.8 0.1\nMIN_INTERACTIONS_FOR_ACTIVE 6 = the median (50th percentile) of the number of interactions per user, meaning that 50% of the users have 6 or fewer interactions\nHIGH_WATCH_THRESHOLD 0.8 = 75th percentile of the watched field - a user who has a watched rate above this value (80% or more) is considered highly engaged\nLOW_WATCH_THRESHOLD 0.1 = 25th percentile of the watched field, indicating low engagement- uers with a watched rate of 10% or lower are considered to have low engagement.\n'

In [18]:
# Cache user interaction metrics for engagement classification

user_interactions_cache = user_data.groupby('user_id', group_keys=False).apply(lambda x: {
    'total_interactions': len(x),
    'avg_watch': x['watched'].mean(),
    'total_skipped': x['skipped'].sum()
}).to_dict()

# Classify engagement
def classify_user_engagement(user_id):
    user_metrics = user_interactions_cache.get(user_id, {})
    total_interactions = user_metrics.get('total_interactions', 0)
    avg_watch = user_metrics.get('avg_watch', 0)
    total_skipped = user_metrics.get('total_skipped', 0)

    if total_interactions < MIN_INTERACTIONS_FOR_ACTIVE:
        return "new_user"
    elif avg_watch < LOW_WATCH_THRESHOLD and total_skipped >= total_interactions / 2:
        return "low_engagement_user"
    else:
        return "active_user"

# Assign profiles to users
user_profiles = {classify_user_engagement(user_id): user_id for user_id in user_data['user_id'].unique()}
print("User profiles assigned:", user_profiles)

'''
User with ID '5' is classified as an "active_user."
User with ID '7' is classified as a "low_engagement_user."
User with ID '6' is classified as a "new_user."
'''

User profiles assigned: {'active_user': '5', 'low_engagement_user': '7', 'new_user': '6'}


  user_interactions_cache = user_data.groupby('user_id', group_keys=False).apply(lambda x: {


'\nUser with ID \'5\' is classified as an "active_user."\nUser with ID \'7\' is classified as a "low_engagement_user."\nUser with ID \'6\' is classified as a "new_user."\n'

In [19]:
# OPTION 1 - recommending videos caping only 5 videos in a day for all users,
import json
# Generate blended recommendations for a single user
def get_recommendations(user_id, n_recommendations=5, svd_weight=SVD_WEIGHT, lightfm_weight=LIGHTFM_WEIGHT):
    combined_scores = {}
    unwatched_videos = video_catalog[~video_catalog['video_link'].isin(user_data[user_data['user_id'] == user_id]['video_link'])]

    # Get SVD-based scores
    for video in unwatched_videos['video_link']:
        try:
            est_rating = final_model.predict(user_id, video).est  # Use the optimized model
            combined_scores[video] = combined_scores.get(video, 0) + est_rating * svd_weight
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # Get LightFM-based scores
    user_index = lfm_dataset.mapping()[0].get(str(user_id))
    if user_index is not None:
        scores = lfm_model.predict(user_index, np.arange(len(video_catalog)), item_features=item_features)
        for i, score in enumerate(scores):
            video = video_catalog.iloc[i]['video_link']
            combined_scores[video] = combined_scores.get(video, 0) + score * lightfm_weight

    # Sort and limit to top recommendations
    recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return [rec[0] for rec in recommendations]

# Generate recommendations for all users and save to JSON
all_recommendations = {}
for user_id in user_data['user_id'].unique():
    all_recommendations[user_id] = get_recommendations(user_id)

# Save recommendations to JSON file
with open('recommendation_cache.json', 'w') as f:
    json.dump(all_recommendations, f, indent=4)

# Display the recommendations in the notebook
# print("Daily Video Recommendations for Each User:")
# for user_id, recommendations in all_recommendations.items():
    # print(f"User {user_id}: {recommendations}")

# Optionally: display JSON format in notebook for verification
print("\nJSON Format of Recommendations:")
print(json.dumps(all_recommendations, indent=4))

# Instructions to push the JSON file to GitHub (use in a Jupyter notebook environment with Git configured):
# !git add recommendation_cache.json
# !git commit -m "Update daily recommendation cache"
# !git push origin main

'''
Future Consideration for Large Datasets
Parallel Processing: For a large user base, consider using batch processing tools like Apache Spark or parallel processing libraries to handle recommendation generation concurrently.
Incremental Updates: If the video catalog frequently updates, a smart caching mechanism could minimize unnecessary recalculations for unchanged items. TO DO....
'''


JSON Format of Recommendations:
{
    "1": [
        "https://www.youtube.com/shorts/HL2pI9G6w0U",
        "https://www.youtube.com/shorts/kF0MRowRcIM",
        "https://www.youtube.com/shorts/MjD8NXuJ9yQ",
        "https://www.youtube.com/shorts/SdUS6f4g_O4",
        "https://www.youtube.com/watch?v=mGWBy_5gBPY"
    ],
    "2": [
        "https://www.youtube.com/watch?v=7iqMNnzQPmY",
        "https://www.youtube.com/watch?v=ziTvx_1xKEg",
        "https://www.youtube.com/watch?v=D4vN_5MBEog",
        "https://www.youtube.com/shorts/BzlWWZ5lhSE",
        "https://www.youtube.com/shorts/t3Hmj_-K4aQ"
    ],
    "3": [
        "https://www.youtube.com/shorts/Utu9EFE3kPI",
        "https://www.youtube.com/shorts/t3Hmj_-K4aQ",
        "https://www.youtube.com/shorts/WCciYRmBJNk",
        "https://www.youtube.com/watch?v=HZCv-KeUDiU",
        "https://www.youtube.com/watch?v=D4vN_5MBEog"
    ],
    "4": [
        "https://www.youtube.com/shorts/F93KNgr9cOo",
        "https://www.youtube.com/

'\nFuture Consideration for Large Datasets\nParallel Processing: For a large user base, consider using batch processing tools like Apache Spark or parallel processing libraries to handle recommendation generation concurrently.\nIncremental Updates: If the video catalog frequently updates, a smart caching mechanism could minimize unnecessary recalculations for unchanged items. TO DO....\n'

In [None]:
# TO DO - OPTION 1 - recommending videos caping only 5 videos in a day,
# with incremental updates (caching previously generated recommendations and only recalculating them for new or updated videos)

In [20]:
# OPTION 2 - recommending videos dynamically adjusts recommendations based on both user engagement level and real-time user activity

# Define weights for SVD and LightFM recommendations
SVD_WEIGHT = 0.6
LIGHTFM_WEIGHT = 0.4

# Function to dynamically adjust the number of recommendations
def get_dynamic_n_recommendations(user_id, user_engagement_profile):
    """Determine the number of recommendations based on user engagement and recent interactions."""
    # Base recommendation count per engagement level
    base_recommendations = {
        "active_user": 10,
        "low_engagement_user": 3,
        "new_user": 5
    }

    # Start with base recommendation count
    n_recommendations = base_recommendations.get(user_engagement_profile, 5)

    # Fetch recent interaction data for the user
    recent_interactions = user_data[user_data['user_id'] == user_id]
    recent_watched_count = len(recent_interactions[recent_interactions['watched'] > 0.5])  # videos with >50% watch time

    # Adjust recommendation count based on user activity
    if recent_watched_count >= n_recommendations * 0.8:  # if 80%+ recommendations watched, increase by 20%
        n_recommendations = int(n_recommendations * 1.2)
    elif recent_watched_count <= n_recommendations * 0.2:  # if <20% watched, decrease by 20%
        n_recommendations = max(3, int(n_recommendations * 0.8))  # Ensure a minimum of 3 recommendations

    # Further increase based on catalog size, ensuring it's proportional to available content
    catalog_factor = min(len(video_catalog) // 10, 10)  # Max increase factor of 10, dependent on catalog size
    n_recommendations = min(n_recommendations + catalog_factor, 20)  # Cap at 20 recommendations

    return n_recommendations

# Generate blended recommendations
def get_recommendations(user_id, svd_weight=SVD_WEIGHT, lightfm_weight=LIGHTFM_WEIGHT):
    # Determine engagement profile and dynamic recommendation count
    user_engagement_profile = classify_user_engagement(user_id)
    n_recommendations = get_dynamic_n_recommendations(user_id, user_engagement_profile)

    combined_scores = {}
    unwatched_videos = video_catalog[~video_catalog['video_link'].isin(user_data[user_data['user_id'] == user_id]['video_link'])]

    # Generate SVD scores
    for video in unwatched_videos['video_link']:
        try:
            est_rating = final_model.predict(user_id, video).est
            combined_scores[video] = combined_scores.get(video, 0) + est_rating * svd_weight
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # Generate LightFM scores
    user_index = lfm_dataset.mapping()[0].get(str(user_id))
    if user_index is not None:
        scores = lfm_model.predict(user_index, np.arange(len(video_catalog)), item_features=item_features)
        for i, score in enumerate(scores):
            video = video_catalog.iloc[i]['video_link']
            combined_scores[video] = combined_scores.get(video, 0) + score * lightfm_weight

    # Sort and limit to the top N recommendations
    recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return [rec[0] for rec in recommendations]

# Generate recommendations for all users and save to JSON
all_recommendations = {user_id: get_recommendations(user_id) for user_id in user_data['user_id'].unique()}

# Save recommendations to JSON file
json_filename2 = 'recommendation_dynamic_cache.json'
with open(json_filename2, 'w') as f:
    json.dump(all_recommendations, f, indent=4)

# Print JSON file content in the notebook
with open(json_filename2, 'r') as f:
    print("Recommendations dynamic JSON content:\n", f.read())

# Git commands to push the JSON file to GitHub
# (These commands require Git to be configured in the notebook environment)
# !git add recommendation_cache.json
# !git commit -m "Update daily recommendation cache"
# !git push origin main



Recommendations dynamic JSON content:
 {
    "1": [
        "https://www.youtube.com/shorts/HL2pI9G6w0U",
        "https://www.youtube.com/shorts/kF0MRowRcIM",
        "https://www.youtube.com/shorts/MjD8NXuJ9yQ",
        "https://www.youtube.com/shorts/SdUS6f4g_O4",
        "https://www.youtube.com/watch?v=mGWBy_5gBPY",
        "https://www.youtube.com/watch?v=D4vN_5MBEog",
        "https://www.youtube.com/shorts/SbiJoaKttSI",
        "https://www.youtube.com/shorts/WCciYRmBJNk",
        "https://www.youtube.com/shorts/BzlWWZ5lhSE",
        "https://www.youtube.com/shorts/hrLoERAIj9Q"
    ],
    "2": [
        "https://www.youtube.com/watch?v=7iqMNnzQPmY",
        "https://www.youtube.com/watch?v=ziTvx_1xKEg",
        "https://www.youtube.com/watch?v=D4vN_5MBEog",
        "https://www.youtube.com/shorts/BzlWWZ5lhSE",
        "https://www.youtube.com/shorts/t3Hmj_-K4aQ",
        "https://www.youtube.com/shorts/mElZ81wdaLQ",
        "https://www.youtube.com/shorts/SbiJoaKttSI",
        