In [None]:
import pandas as pd

In [None]:
# Raw GitHub location of the user-interaction JSON file
url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"

# Parse the remote JSON into a DataFrame
data = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows (five is head()'s default)
print(data.head(n=5))


   user_id country         city state  age  \
0        1     USA  Los Angeles    CA   30   
1        2     USA     New York    NY   30   
2        3     USA  Los Angeles    CA   18   
3        4  Canada      Toronto    ON   40   
4        5  Canada    Vancouver    BC   18   

                                    video_link      genre  watched  liked  \
0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   

   skipped  
0     True  
1     True  
2     True  
3     True  
4     True  


In [None]:
# Raw GitHub location of the video catalog JSON file

url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/Video_catalog.json"
video_catalog = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows (five is head()'s default)
print(video_catalog.head(n=5))


   video_id                                   video_link      genre  country  \
0         1  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      USA   
1         2  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      USA   
2         3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical   Canada   
3         4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop   Canada   
4         5   https://www.youtube.com/shorts/kF0MRowRcIM    African  Nigeria   

          city age_group  
0  Los Angeles     18-35  
1     New York     18-35  
2      Toronto     35-50  
3    Vancouver     18-25  
4         Kano     35-50  


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
from datetime import datetime

# Configure logging for errors only
# NOTE(review): with the level set to ERROR, the logging.info(...) calls made
# later in this cell (SVD RMSE, LightFM precision) are filtered out and never
# reach recommendation_system.log. Lower the level to INFO if those metrics
# are meant to be recorded.
logging.basicConfig(filename="recommendation_system.log", level=logging.ERROR)

# Define URL parameters for flexibility
# Both constants point at JSON files in the same GitHub repository; note the
# two URLs spell the branch differently ("refs/heads/main" vs "main").
USER_DATA_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
VIDEO_CATALOG_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/main/dataset/Video_catalog.json"

# Load data with error handling and retry mechanism
def load_data(url, retries=3, backoff=1.0):
    """Read a JSON resource into a DataFrame, retrying on failure.

    Args:
        url: Path, URL or file-like object accepted by ``pd.read_json``.
        retries: Total number of attempts before giving up (at least 1).
        backoff: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.

    Returns:
        The parsed DataFrame, or an empty DataFrame when every attempt
        failed. Each failure is logged; callers should check ``.empty``.
    """
    import time  # local import: only needed for the retry pause

    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            return pd.read_json(url)
        except Exception as e:
            logging.error(f"Error loading data from {url}: {e}")
            if attempt < attempts:
                time.sleep(backoff * 2 ** (attempt - 1))
    return pd.DataFrame()  # Return an empty DataFrame if loading fails

user_data = load_data(USER_DATA_URL)
video_catalog = load_data(VIDEO_CATALOG_URL)

# load_data returns an empty frame on failure; stop here with a clear message
# instead of failing later with a KeyError on a missing column.
if user_data.empty or video_catalog.empty:
    raise RuntimeError("Dataset could not be loaded; see recommendation_system.log for details.")

# Recommendation model weights
SVD_WEIGHT = 0.6
LIGHTFM_WEIGHT = 0.4

# Collaborative filtering with Surprise SVD model
reader = Reader(rating_scale=(0, 1))
# Surprise raw ids are type-sensitive. user_id is cast to str further down and
# recommendations are requested with str ids, so train on str ids as well;
# otherwise every prediction treats the user as unknown.
surprise_data = Dataset.load_from_df(
    user_data[['user_id', 'video_link', 'liked']].astype({'user_id': str}), reader
)
trainset, testset = train_test_split(surprise_data, test_size=0.2)
svd_model = SVD()
svd_model.fit(trainset)

# Predict and calculate RMSE on the held-out 20%
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
logging.info(f"SVD Model RMSE: {rmse}")

# Encode video features for LightFM model
# De-duplicate first: the link -> id lookup below needs a unique index.
# .copy() keeps the column assignment from touching a slice of the original.
video_catalog = video_catalog.drop_duplicates(subset=['video_link']).copy()
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
user_data['user_id'] = user_data['user_id'].astype(str)
user_data['video_id'] = user_data['video_link'].map(video_catalog.set_index('video_link')['video_id'])

# Item feature columns. These must be columns of video_catalog
# (the catalog has genre, age_group, city and country).
feature_columns = ['genre', 'age_group', 'city', 'country']

# Initialize LightFM dataset and build combined item features.
# Every feature value used by build_item_features must be registered here,
# in the same str form that extract_item_features produces.
lfm_dataset = LightFMDataset()
lfm_dataset.fit(users=(x for x in user_data['user_id'].unique()),
                items=(x for x in video_catalog['video_id'].unique()),
                item_features=(x for x in pd.concat(
                    [video_catalog[col] for col in feature_columns]).astype(str).unique()))

# Extract features with optimized apply function
def extract_item_features(df, feature_cols):
    """Return ``(video_id, [feature values as str])`` pairs, one per row of ``df``."""
    return list(zip(df['video_id'], df[feature_cols].apply(lambda x: list(map(str, x)), axis=1)))

item_features = lfm_dataset.build_item_features(extract_item_features(video_catalog, feature_columns))

# LightFM model training with reduced epochs
lfm_model = LightFM(loss='warp')
(interactions, weights) = lfm_dataset.build_interactions(
    ((str(row['user_id']), row['video_id']) for _, row in user_data.iterrows())
)
lfm_model.fit(interactions, item_features=item_features, epochs=15, num_threads=4)

# Precision@5 for LightFM
# NOTE: this is measured on the same interactions the model was fitted on,
# so it is a training-set figure, not a held-out estimate.
precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
logging.info(f"LightFM Precision@5: {precision}")

# Time decay function to adjust ratings
def time_decay(timestamp, decay_rate=0.001):
    """Return the recency weight ``exp(-decay_rate * age_in_days)``.

    Args:
        timestamp: Either a string in "%Y-%m-%dT%H:%M:%S" form or a
            datetime-like object. ``pd.read_json`` parses the timestamp
            column into pandas Timestamps, which ``strptime`` rejects, so
            datetime-like values are used as they are.
        decay_rate: Decay per whole day of age.

    Returns:
        The weight as a float. Timestamps in the future count as age 0, so
        with a non-negative ``decay_rate`` the weight never exceeds 1.
    """
    if isinstance(timestamp, str):
        moment = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
    else:
        moment = timestamp
    # Match the timestamp's timezone (None for naive values) so the
    # subtraction never mixes naive and aware datetimes.
    now = datetime.now(tz=getattr(moment, "tzinfo", None))
    days_ago = max((now - moment).days, 0)
    return np.exp(-decay_rate * days_ago)

# Weight each like by its recency; rows that were not liked get 0
user_data['decayed_liked'] = [
    liked * time_decay(stamp) if liked else 0
    for liked, stamp in zip(user_data['liked'], user_data['timestamp'])
]

# Engagement thresholds derived from the data itself:
# median interaction count per user, and the quartiles of watch fraction
MIN_INTERACTIONS_FOR_ACTIVE = int(user_data['user_id'].value_counts().quantile(0.5))
LOW_WATCH_THRESHOLD, HIGH_WATCH_THRESHOLD = user_data['watched'].quantile([0.25, 0.75])

# Per-user metrics consumed by the engagement classifier
user_interactions_cache = {
    uid: {
        'total_interactions': len(rows),
        'avg_watch': rows['watched'].mean(),
        'total_skipped': rows['skipped'].sum()
    }
    for uid, rows in user_data.groupby('user_id')
}

def classify_user_engagement(user_id):
    """Label a user as "new_user", "low_engagement_user" or "active_user".

    Metrics come from ``user_interactions_cache``; a user missing from the
    cache is treated as having zero interactions.
    """
    metrics = user_interactions_cache.get(user_id, {})
    n_seen = metrics.get('total_interactions', 0)
    mean_watch = metrics.get('avg_watch', 0)
    n_skipped = metrics.get('total_skipped', 0)

    # Too little history to judge engagement
    if n_seen < MIN_INTERACTIONS_FOR_ACTIVE:
        return "new_user"

    rarely_watches = mean_watch < LOW_WATCH_THRESHOLD
    mostly_skips = n_skipped >= n_seen / 2
    return "low_engagement_user" if (rarely_watches and mostly_skips) else "active_user"

# Map every user id to its engagement label. Keying by user id (not by label)
# keeps one entry per user instead of one surviving user per label.
user_profiles = {user_id: classify_user_engagement(user_id) for user_id in user_data['user_id'].unique()}

# Generate recommendations
def get_recommendations(user_id, n_recommendations=5, svd_weight=SVD_WEIGHT, lightfm_weight=LIGHTFM_WEIGHT):
    """Blend SVD and LightFM scores and return the best unwatched video links.

    Args:
        user_id: User id as stored in ``user_data['user_id']``.
        n_recommendations: Maximum number of links to return.
        svd_weight: Multiplier applied to the SVD estimated rating.
        lightfm_weight: Multiplier applied to the LightFM score.

    Returns:
        Video links the user has no recorded interaction with, ordered by
        blended score, best first.
    """
    watched_links = set(user_data.loc[user_data['user_id'] == user_id, 'video_link'])
    unwatched_videos = video_catalog[~video_catalog['video_link'].isin(watched_links)]
    combined_scores = {}

    for video in unwatched_videos['video_link']:
        try:
            est_rating = svd_model.predict(user_id, video).est
            combined_scores[video] = combined_scores.get(video, 0) + est_rating * svd_weight
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # NOTE(review): SVD estimates lie on the 0-1 rating scale while LightFM
    # scores are unbounded, so the weights do not blend like-for-like values.
    id_maps = lfm_dataset.mapping()
    user_index = id_maps[0].get(str(user_id))
    if user_index is not None:
        item_id_map = id_maps[2]
        # Score only unwatched videos, and translate each video_id through
        # LightFM's own id mapping rather than assuming row position == index.
        candidates = [
            (link, item_id_map[video_id])
            for link, video_id in zip(unwatched_videos['video_link'], unwatched_videos['video_id'])
            if video_id in item_id_map
        ]
        if candidates:
            links, item_indices = zip(*candidates)
            scores = lfm_model.predict(
                user_index, np.asarray(item_indices, dtype=np.int32), item_features=item_features
            )
            for link, score in zip(links, scores):
                combined_scores[link] = combined_scores.get(link, 0) + score * lightfm_weight

    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return [link for link, _ in ranked[:max(n_recommendations, 0)]]

# Ask for a user id interactively and print that user's recommendations
unique_user_ids = user_data['user_id'].unique().tolist()
user_input = input(f"Enter a user ID from the following options: {', '.join(unique_user_ids)} for recommendations: ")

if user_input not in unique_user_ids:
    print("Invalid user ID. Please enter a valid user ID from the list:", unique_user_ids)
else:
    recommendations = get_recommendations(user_input, n_recommendations=5)
    print(f"Recommendations for user {user_input}:", recommendations)



In [1]:
# Install both 'surprise' and 'lightfm' libraries in Google Colab
!pip install scikit-surprise lightfm


import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
from datetime import datetime

# Configure logging for errors only
# NOTE(review): with the level set to ERROR, the logging.info(...) calls in
# the following cells (load messages, RMSE, precision) are filtered out and
# never reach recommendation_system.log.
logging.basicConfig(filename="recommendation_system.log", level=logging.ERROR)


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise, lightfm
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?

In [11]:
# Define URL parameters for flexibility
# Raw-content URLs of the two JSON files read by load_data below:
# the per-user interaction records and the video catalog.
USER_DATA_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
VIDEO_CATALOG_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/Video_catalog.json"

# Load data with error handling and retry mechanism
def load_data(url, retries=3, backoff=1.0):
    """Read a JSON resource into a DataFrame, retrying on failure.

    Args:
        url: Path, URL or file-like object accepted by ``pd.read_json``.
        retries: Total number of attempts before giving up (at least 1).
        backoff: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.

    Returns:
        The parsed DataFrame, or an empty DataFrame when every attempt
        failed. Each failure is logged; callers should check ``.empty``.
    """
    import time  # local import: only needed for the retry pause

    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            logging.info(f"Loading data from {url}.")
            return pd.read_json(url)
        except Exception as e:
            logging.error(f"Error loading data from {url}: {e}")
            if attempt < attempts:
                time.sleep(backoff * 2 ** (attempt - 1))
    return pd.DataFrame()  # Return an empty DataFrame if loading fails

# Fetch both datasets, user interactions first, then the catalog
user_data, video_catalog = (load_data(source) for source in (USER_DATA_URL, VIDEO_CATALOG_URL))
user_data.head(), video_catalog.head()  # Show the leading rows of each frame to confirm loading


(   user_id country         city state  age  \
 0        1     USA  Los Angeles    CA   30   
 1        2     USA     New York    NY   30   
 2        3     USA  Los Angeles    CA   18   
 3        4  Canada      Toronto    ON   40   
 4        5  Canada    Vancouver    BC   18   
 
                                     video_link      genre  watched  liked  \
 0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
 1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
 2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
 3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
 4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   
 
    skipped           timestamp  
 0     True 2024-10-30 08:00:00  
 1     True 2024-10-30 08:02:00  
 2     True 2024-10-30 08:04:00  
 3     True 2024-10-30 08:06:00  
 4     True 2024-10-30 08:08:00  ,
    video_id                             

In [3]:
# Disabled baseline: the whole cell body is a string literal, so none of the
# code inside it runs; the cell only evaluates to that string. The tuned
# grid-search cell that follows supersedes it.
# NOTE(review): for 0/1 ratings, always predicting 0.5 already gives an RMSE
# of 0.5, so the 0.4764 quoted inside the string is only slightly better than
# that constant baseline.
'''
# Set up the collaborative filtering model with Surprise (SVD-based)
reader = Reader(rating_scale=(0, 1))
surprise_data = Dataset.load_from_df(user_data[['user_id', 'video_link', 'liked']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2)

# Train the SVD model
svd_model = SVD()
svd_model.fit(trainset)

# Predict on the test set and calculate RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
logging.info(f"SVD Model RMSE: {rmse}")
print(f"SVD Model RMSE: {rmse}")

# rating are binary: 0 for "not liked" and 1 for "liked".
# an RMSE closer to 0 (like 0.4764 here) still indicates that the model is effectively predicting ratings in line with actual user preferences
# can be small but not; dataset is small or lacks diversity in user interactions
'''

RMSE: 0.4764
SVD Model RMSE: 0.47639078699546056


In [12]:
# option 2 of above
# Set up the collaborative filtering model with Surprise (SVD-based)
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

reader = Reader(rating_scale=(0, 1))
# Surprise raw ids are type-sensitive. A later cell casts user_id to str and
# recommendations are requested with str ids, so train on str ids here;
# otherwise every prediction would treat the user as unknown.
surprise_data = Dataset.load_from_df(
    user_data[['user_id', 'video_link', 'liked']].astype({'user_id': str}), reader
)

# Define the parameter grid for GridSearch
param_grid = {
    'n_factors': [100, 200, 250, 300],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.002, 0.003, 0.005],
    'reg_all': [0.15, 0.18, 0.2]  # Increased regularization values
}

# Initialize GridSearchCV to find the best hyperparameters (3-fold, scored by RMSE)
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(surprise_data)

# Retrieve the best parameters and RMSE score from the grid search
best_params = grid_search.best_params['rmse']
best_rmse = grid_search.best_score['rmse']
logging.info(f"Best RMSE Score from Grid Search: {best_rmse}")
logging.info(f"Best Hyperparameters: {best_params}")
print(f"Best RMSE Score from Grid Search: {best_rmse}")
print(f"Best Hyperparameters: {best_params}")

# Evaluate the tuned configuration with 10-fold cross-validation
logging.info("Evaluating the optimized model with cross-validation.")
final_model = SVD(**best_params)
cross_val_results = cross_validate(final_model, surprise_data, measures=['rmse'], cv=10, verbose=True)
average_rmse = cross_val_results['test_rmse'].mean()
print(f"Average RMSE with Cross-Validation: {average_rmse}")
logging.info(f"Average RMSE with Cross-Validation: {average_rmse}")

# Optional: If final evaluation on a test set is desired
trainset, testset = train_test_split(surprise_data, test_size=0.2)
final_model.fit(trainset)
predictions = final_model.test(testset)
test_rmse = accuracy.rmse(predictions)
logging.info(f"Optimized SVD Model Test RMSE: {test_rmse}")
print(f"Optimized SVD Model Test RMSE: {test_rmse}")

# The recommendation cell looks the SVD model up as `svd_model`; expose the
# tuned model under that name so it does not depend on a leftover variable.
svd_model = final_model

Best RMSE Score from Grid Search: 0.4627438780679937
Best Hyperparameters: {'n_factors': 250, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.18}
Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.6108  0.4794  0.4164  0.5463  0.6602  0.4738  0.3217  0.2713  0.4744  0.5490  0.4803  0.1147  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Average RMSE with Cross-Validation: 0.48033054904706923
RMSE: 0.5141
Optimized SVD Model Test RMSE: 0.5140954270425315


In [19]:
# Ensure unique video_link values in video_catalog before setting index.
# .copy() detaches the de-duplicated frame so the column assignment below
# does not raise SettingWithCopyWarning.
video_catalog = video_catalog.drop_duplicates(subset=['video_link']).copy()

# Encode video features for LightFM model
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
user_data['user_id'] = user_data['user_id'].astype(str)
user_data['video_id'] = user_data['video_link'].map(video_catalog.set_index('video_link')['video_id'])

# Initialize LightFM dataset with all item feature types.
# Feature values are registered in str form because extract_item_features
# converts every value with str() before build_item_features looks it up.
lfm_dataset = LightFMDataset()
lfm_dataset.fit(
    users=(x for x in user_data['user_id'].unique()),
    items=(x for x in video_catalog['video_id'].unique()),
    item_features=(x for x in pd.concat([video_catalog['genre'], video_catalog['age_group'],
                                         video_catalog['city'], video_catalog['country']]).astype(str).unique())
)

# Feature extraction helper used to build the LightFM item-feature matrix
def extract_item_features(df, feature_cols):
    """Pair each row's ``video_id`` with its feature values rendered as strings.

    Returns a list of ``(video_id, [str, ...])`` tuples in row order, the
    shape passed to ``build_item_features``.
    """
    pairs = []
    for video_id, (_, row) in zip(df['video_id'], df[feature_cols].iterrows()):
        pairs.append((video_id, [str(value) for value in row]))
    return pairs

# Define the feature columns based on the actual data structure in video_catalog
feature_columns = ['genre', 'age_group', 'city', 'country']
item_features = lfm_dataset.build_item_features(extract_item_features(video_catalog, feature_columns))

# Define the range of parameters to test for LightFM - BASELINE
loss_functions = ['warp', 'bpr', 'warp-kos']
epochs = [15, 30] # adjust epochs later
learning_rates = [0.01, 0.05]
embedding_sizes = [30, 50]  # no_components # adjust epochs later

# The interaction matrix does not depend on the hyperparameters: build it once.
(interactions, weights) = lfm_dataset.build_interactions(
    ((str(row['user_id']), row['video_id']) for _, row in user_data.iterrows())
)

# Initialize variables to store best results
best_precision = 0
best_params = {}
best_model = None

# Loop through each combination of parameters
for loss in loss_functions:
    for epoch in epochs:
        for lr in learning_rates:
            for n_components in embedding_sizes:
                # Train LightFM model with the current set of parameters
                candidate = LightFM(loss=loss, learning_rate=lr, no_components=n_components)
                candidate.fit(interactions, item_features=item_features, epochs=epoch, num_threads=4)

                # Calculate Precision@5 for LightFM.
                # NOTE: scored on the interactions the model was fitted on,
                # so this is a training-set figure, not a held-out estimate.
                precision = precision_at_k(candidate, interactions, item_features=item_features, k=5).mean()
                logging.info(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, Precision: {precision}")
                print(f"Loss Function: {loss}, Epochs: {epoch}, Learning Rate: {lr}, Components: {n_components}, Precision: {precision}")

                # Update best precision, parameters and model if current precision is higher
                if precision > best_precision:
                    best_precision = precision
                    best_params = {'loss': loss, 'epochs': epoch, 'learning_rate': lr, 'no_components': n_components}
                    best_model = candidate

# Keep the best-scoring model as `lfm_model` (the name the recommendation cell
# uses) rather than whichever configuration happened to be trained last.
lfm_model = best_model if best_model is not None else candidate

# Display the best parameters and corresponding precision score
print(f"\nBest Precision: {best_precision} with parameters: {best_params}")
logging.info(f"Best Precision: {best_precision} with parameters: {best_params}")


'''
Result: Best Precision: 0.8285714387893677 with parameters: {'loss': 'warp-kos', 'epochs': 30, 'learning_rate': 0.05, 'no_components': 30}
0.83 means that in the top 5 recommendations, about 83% of the items are relevant
'''

Loss Function: warp, Epochs: 15, Learning Rate: 0.01, Components: 30, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 15, Learning Rate: 0.01, Components: 50, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 15, Learning Rate: 0.05, Components: 30, Precision: 0.5428571701049805
Loss Function: warp, Epochs: 15, Learning Rate: 0.05, Components: 50, Precision: 0.5142857432365417
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 30, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.01, Components: 50, Precision: 0.3142857253551483
Loss Function: warp, Epochs: 30, Learning Rate: 0.05, Components: 30, Precision: 0.7142857313156128
Loss Function: warp, Epochs: 30, Learning Rate: 0.05, Components: 50, Precision: 0.6571429371833801
Loss Function: bpr, Epochs: 15, Learning Rate: 0.01, Components: 30, Precision: 0.2857142984867096
Loss Function: bpr, Epochs: 15, Learning Rate: 0.01, Components: 50, Precision: 0.2857142984867096
Lo

"\nResult: Best Precision: 0.8285714387893677 with parameters: {'loss': 'warp-kos', 'epochs': 30}\n0.83 means that in the top 5 recommendations, about 83% of the items are relevant\n"

In [None]:
# Time decay function to adjust ratings based on recency of interaction
def time_decay(timestamp, decay_rate=0.001):
    """Return the recency weight ``exp(-decay_rate * age_in_days)``.

    Args:
        timestamp: Either a string in "%Y-%m-%dT%H:%M:%S" form or a
            datetime-like object. ``pd.read_json`` parses the timestamp
            column into pandas Timestamps, which ``strptime`` rejects, so
            datetime-like values are used as they are.
        decay_rate: Decay per whole day of age.

    Returns:
        The weight as a float. Timestamps in the future count as age 0, so
        with a non-negative ``decay_rate`` the weight never exceeds 1.
    """
    if isinstance(timestamp, str):
        moment = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
    else:
        moment = timestamp
    # Match the timestamp's timezone (None for naive values) so the
    # subtraction never mixes naive and aware datetimes.
    now = datetime.now(tz=getattr(moment, "tzinfo", None))
    days_ago = max((now - moment).days, 0)
    return np.exp(-decay_rate * days_ago)

# Weight each like by its recency; rows that were not liked get 0
user_data['decayed_liked'] = [
    liked * time_decay(stamp) if liked else 0
    for liked, stamp in zip(user_data['liked'], user_data['timestamp'])
]

# Engagement thresholds derived from the data itself:
# median interaction count per user, and the quartiles of watch fraction
MIN_INTERACTIONS_FOR_ACTIVE = int(user_data['user_id'].value_counts().quantile(0.5))
LOW_WATCH_THRESHOLD, HIGH_WATCH_THRESHOLD = user_data['watched'].quantile([0.25, 0.75])

print("Engagement thresholds set:", MIN_INTERACTIONS_FOR_ACTIVE, HIGH_WATCH_THRESHOLD, LOW_WATCH_THRESHOLD)


In [None]:
# Per-user metrics (interaction count, mean watch fraction, skip count)
# precomputed once for the engagement classifier
user_interactions_cache = {
    uid: {
        'total_interactions': len(rows),
        'avg_watch': rows['watched'].mean(),
        'total_skipped': rows['skipped'].sum()
    }
    for uid, rows in user_data.groupby('user_id')
}

# Engagement classifier
def classify_user_engagement(user_id):
    """Label a user as "new_user", "low_engagement_user" or "active_user".

    Metrics come from ``user_interactions_cache``; a user missing from the
    cache is treated as having zero interactions.
    """
    metrics = user_interactions_cache.get(user_id, {})
    n_seen = metrics.get('total_interactions', 0)
    mean_watch = metrics.get('avg_watch', 0)
    n_skipped = metrics.get('total_skipped', 0)

    # Too little history to judge engagement
    if n_seen < MIN_INTERACTIONS_FOR_ACTIVE:
        return "new_user"

    rarely_watches = mean_watch < LOW_WATCH_THRESHOLD
    mostly_skips = n_skipped >= n_seen / 2
    return "low_engagement_user" if (rarely_watches and mostly_skips) else "active_user"

# Assign profiles to users: map every user id to its engagement label.
# Keying by user id (not by label) keeps one entry per user instead of one
# surviving user per label.
user_profiles = {user_id: classify_user_engagement(user_id) for user_id in user_data['user_id'].unique()}
print("User profiles assigned:", user_profiles)


In [None]:
# Blend weights for the two models. Defined here because the function below
# reads them as default arguments, which are evaluated when the def runs.
SVD_WEIGHT = 0.6
LIGHTFM_WEIGHT = 0.4

# Generate blended recommendations
def get_recommendations(user_id, n_recommendations=5, svd_weight=SVD_WEIGHT, lightfm_weight=LIGHTFM_WEIGHT):
    """Blend SVD and LightFM scores and return the best unwatched video links.

    Relies on ``svd_model``, ``lfm_model``, ``lfm_dataset``, ``item_features``,
    ``user_data`` and ``video_catalog`` having been created by earlier cells.

    Args:
        user_id: User id as stored in ``user_data['user_id']``.
        n_recommendations: Maximum number of links to return.
        svd_weight: Multiplier applied to the SVD estimated rating.
        lightfm_weight: Multiplier applied to the LightFM score.

    Returns:
        Video links the user has no recorded interaction with, ordered by
        blended score, best first.
    """
    watched_links = set(user_data.loc[user_data['user_id'] == user_id, 'video_link'])
    unwatched_videos = video_catalog[~video_catalog['video_link'].isin(watched_links)]
    combined_scores = {}

    for video in unwatched_videos['video_link']:
        try:
            est_rating = svd_model.predict(user_id, video).est
            combined_scores[video] = combined_scores.get(video, 0) + est_rating * svd_weight
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # NOTE(review): SVD estimates lie on the 0-1 rating scale while LightFM
    # scores are unbounded, so the weights do not blend like-for-like values.
    id_maps = lfm_dataset.mapping()
    user_index = id_maps[0].get(str(user_id))
    if user_index is not None:
        item_id_map = id_maps[2]
        # Score only unwatched videos, and translate each video_id through
        # LightFM's own id mapping rather than assuming row position == index.
        candidates = [
            (link, item_id_map[video_id])
            for link, video_id in zip(unwatched_videos['video_link'], unwatched_videos['video_id'])
            if video_id in item_id_map
        ]
        if candidates:
            links, item_indices = zip(*candidates)
            scores = lfm_model.predict(
                user_index, np.asarray(item_indices, dtype=np.int32), item_features=item_features
            )
            for link, score in zip(links, scores):
                combined_scores[link] = combined_scores.get(link, 0) + score * lightfm_weight

    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return [link for link, _ in ranked[:max(n_recommendations, 0)]]

# Try the recommender interactively: ask for a user id and print the result
unique_user_ids = user_data['user_id'].unique().tolist()
user_input = input(f"Enter a user ID from the following options: {', '.join(unique_user_ids)} for recommendations: ")

if user_input not in unique_user_ids:
    print("Invalid user ID. Please enter a valid user ID from the list:", unique_user_ids)
else:
    recommendations = get_recommendations(user_input, n_recommendations=5)
    print(f"Recommendations for user {user_input}:", recommendations)
