In [None]:
import pandas as pd

In [None]:
# Raw GitHub location of the synthetic user-interaction dataset.
url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"

# Read the remote JSON file into a DataFrame.
data = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows (head() defaults to five).
print(data.head(5))


   user_id country         city state  age  \
0        1     USA  Los Angeles    CA   30   
1        2     USA     New York    NY   30   
2        3     USA  Los Angeles    CA   18   
3        4  Canada      Toronto    ON   40   
4        5  Canada    Vancouver    BC   18   

                                    video_link      genre  watched  liked  \
0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   

   skipped  
0     True  
1     True  
2     True  
3     True  
4     True  


In [None]:
# Raw GitHub location of the video catalog.
url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/Video_catalog.json"

# Read the remote JSON file into a DataFrame.
video_catalog = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows (head() defaults to five).
print(video_catalog.head(5))


   video_id                                   video_link      genre  country  \
0         1  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      USA   
1         2  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      USA   
2         3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical   Canada   
3         4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop   Canada   
4         5   https://www.youtube.com/shorts/kF0MRowRcIM    African  Nigeria   

          city age_group  
0  Los Angeles     18-35  
1     New York     18-35  
2      Toronto     35-50  
3    Vancouver     18-25  
4         Kano     35-50  


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
from datetime import datetime

# Route INFO-and-above log records to a file so the run can be inspected afterwards.
logging.basicConfig(level=logging.INFO, filename="recommendation_system.log")

# Dataset locations, kept in constants so they are easy to swap out.
USER_DATA_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
VIDEO_CATALOG_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/main/dataset/Video_catalog.json"

# Fetch both JSON files (interactions first, then the catalog) into DataFrames.
logging.info("Loading user interaction data and video catalog.")
user_data, video_catalog = (
    pd.read_json(source) for source in (USER_DATA_URL, VIDEO_CATALOG_URL)
)

# Collaborative filtering with Surprise (SVD).
#
# Surprise matches raw user/item IDs by equality, so the IDs used for training must have
# the same type as the IDs later passed to svd_model.predict(). Further down, user_id is
# cast to str and recommendations are requested with those string IDs; training on the
# original integer IDs would make every user "unknown" to the model and collapse the
# SVD scores to non-personalised item baselines. Cast before building the dataset.
user_data['user_id'] = user_data['user_id'].astype(str)

# Fixed seed so the train/test split, the factor initialisation and the logged RMSE
# are reproducible across Restart & Run All.
SVD_RANDOM_SEED = 42

reader = Reader(rating_scale=(0, 1))  # 'liked' is binary: 0 = not liked, 1 = liked
svd_ratings = user_data[['user_id', 'video_link', 'liked']].assign(
    liked=lambda frame: frame['liked'].astype(float)  # booleans -> explicit 0.0 / 1.0 ratings
)
surprise_data = Dataset.load_from_df(svd_ratings, reader)

# Hold out 20% of the interactions for evaluation and fit on the rest.
logging.info("Training SVD model for collaborative filtering.")
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=SVD_RANDOM_SEED)
svd_model = SVD(random_state=SVD_RANDOM_SEED)
svd_model.fit(trainset)
logging.info("SVD model training completed.")

# Score the held-out interactions; accuracy.rmse also prints the value.
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
logging.info(f"SVD Model RMSE: {rmse}")

# Prepare the LightFM dataset: integer item IDs plus categorical item features.
logging.info("Encoding video features for LightFM model.")
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
user_data['user_id'] = user_data['user_id'].astype(str)

# Attach the catalog ID to every interaction. Links are de-duplicated first because
# Series.map needs a uniquely indexed lookup; factorize() already gives repeated
# links the same ID, so nothing is lost.
link_to_video_id = (
    video_catalog.drop_duplicates(subset='video_link')
    .set_index('video_link')['video_id']
)
user_data['video_id'] = user_data['video_link'].map(link_to_video_id)

# Item features are taken from columns the catalog actually has. The catalog carries
# 'age_group' and 'country' (not 'age' / 'state', which are user-side columns), so
# indexing those missing names would raise KeyError.
ITEM_FEATURE_COLUMNS = [
    col for col in ('genre', 'age_group', 'city', 'country') if col in video_catalog.columns
]
# One list of string tokens per catalog row, in ITEM_FEATURE_COLUMNS order.
item_feature_tokens = video_catalog[ITEM_FEATURE_COLUMNS].astype(str).values.tolist()

# Every token later passed to build_item_features must be registered in fit(),
# otherwise LightFM rejects it. dict.fromkeys de-duplicates while keeping a stable order.
lfm_dataset = LightFMDataset()
lfm_dataset.fit(
    users=user_data['user_id'].unique(),
    items=video_catalog['video_id'].unique(),
    item_features=list(dict.fromkeys(
        token for tokens in item_feature_tokens for token in tokens
    )),
)

# Build the item-feature matrix from (video_id, [tokens...]) pairs.
item_features = lfm_dataset.build_item_features(
    zip(video_catalog['video_id'], item_feature_tokens)
)

# Interactions whose link is missing from the catalog have a NaN video_id and cannot be
# resolved by the dataset mapping; leave them out of the interaction matrix only
# (user_data itself is not filtered).
mapped_interactions = user_data.dropna(subset=['video_id'])
unmapped_count = len(user_data) - len(mapped_interactions)
if unmapped_count:
    logging.warning(f"Skipping {unmapped_count} interactions whose video_link is not in the catalog.")

# Train LightFM model
logging.info("Training LightFM model with combined item features.")
lfm_model = LightFM(loss='warp', random_state=42)
(interactions, weights) = lfm_dataset.build_interactions(
    zip(mapped_interactions['user_id'], mapped_interactions['video_id'].astype(int))
)
lfm_model.fit(interactions, item_features=item_features, epochs=30, num_threads=2)

# Precision@5 is measured on the same interactions the model was fitted on, so it is
# an optimistic (training-set) figure rather than a held-out estimate.
precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
logging.info(f"LightFM Precision@5: {precision}")

# Time decay function to adjust ratings based on recency of interaction
def time_decay(timestamp, decay_rate=0.001, now=None):
    """Return an exponential recency weight for an interaction.

    The weight is ``exp(-decay_rate * days_ago)``, where ``days_ago`` is the whole
    number of days between ``timestamp`` and ``now``.

    Args:
        timestamp: When the interaction happened. Either a ``datetime`` instance
            (including subclasses such as ``pandas.Timestamp``) or a string in
            ``"%Y-%m-%dT%H:%M:%S"`` format; other ISO-8601 strings (for example with
            fractional seconds) are parsed with ``datetime.fromisoformat``.
        decay_rate: Per-day decay constant.
        now: Reference time. Defaults to the current time, using the timestamp's
            timezone when it has one so the subtraction is well defined.

    Returns:
        The decay weight. Interactions dated after ``now`` are treated as zero days
        old, so the weight never exceeds 1.

    Raises:
        ValueError: If a string timestamp cannot be parsed.
    """
    if isinstance(timestamp, datetime):
        event_time = timestamp
    else:
        text = str(timestamp)
        try:
            event_time = datetime.strptime(text, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            # Fall back to the more permissive ISO parser (fractional seconds, offsets).
            event_time = datetime.fromisoformat(text)

    if now is None:
        # tzinfo is None for naive timestamps, which yields a naive local "now".
        now = datetime.now(event_time.tzinfo)

    # Clamp so a future-dated interaction cannot be boosted above weight 1.
    days_ago = max((now - event_time).days, 0)
    return np.exp(-decay_rate * days_ago)

# Apply time decay to liked ratings.
# The interaction data may not carry a 'timestamp' column (the preview printed earlier
# in this notebook shows none), in which case row['timestamp'] would raise KeyError.
# Decay only when the column exists; otherwise keep the plain like signal.
if 'timestamp' in user_data.columns:
    # Only liked rows are decayed; rows that were not liked score 0 without their
    # timestamp ever being parsed.
    user_data['decayed_liked'] = [
        time_decay(stamp) if liked else 0.0
        for liked, stamp in zip(user_data['liked'], user_data['timestamp'])
    ]
else:
    logging.warning("No 'timestamp' column in user data; using undecayed 'liked' values.")
    user_data['decayed_liked'] = user_data['liked'].astype(float)

# Derive engagement thresholds from the data instead of hard-coding them:
#   - a user needs at least the median number of interactions per user to count as established
#   - watch fractions at the 75th / 25th percentile mark high / low engagement
logging.info("Calculating engagement thresholds based on dataset distribution.")
MIN_INTERACTIONS_FOR_ACTIVE = int(user_data.groupby('user_id').size().quantile(0.5))
HIGH_WATCH_THRESHOLD = user_data['watched'].quantile(0.75)
LOW_WATCH_THRESHOLD = user_data['watched'].quantile(0.25)

def classify_user_engagement(user_id):
    """Label a user as 'new_user', 'low_engagement_user' or 'active_user'.

    Reads the module-level ``user_data`` frame and the thresholds
    ``MIN_INTERACTIONS_FOR_ACTIVE`` and ``LOW_WATCH_THRESHOLD``.

    Args:
        user_id: Value compared for equality against ``user_data['user_id']``.

    Returns:
        "new_user" when the user has fewer than ``MIN_INTERACTIONS_FOR_ACTIVE``
        interactions; "low_engagement_user" when the mean watch fraction is below
        ``LOW_WATCH_THRESHOLD`` and at least half of the interactions were skipped;
        "active_user" otherwise.
    """
    history = user_data.loc[user_data['user_id'] == user_id]
    n_events = len(history)
    mean_watch = history['watched'].mean()
    skip_count = history['skipped'].sum()

    # Too little history to judge engagement at all.
    if n_events < MIN_INTERACTIONS_FOR_ACTIVE:
        return "new_user"

    # Both signals must agree: short watch times and at least half the videos skipped.
    mostly_skipped = skip_count >= n_events / 2
    if mean_watch < LOW_WATCH_THRESHOLD and mostly_skipped:
        return "low_engagement_user"

    return "active_user"

# Classify every user and index the result by profile label. Because the label is the
# dict key, a later user with the same label replaces the earlier one, leaving exactly
# one representative user per engagement profile.
user_profiles = dict(
    (classify_user_engagement(candidate), candidate)
    for candidate in user_data['user_id'].unique()
)

def get_recommendations(user_id, n_recommendations=5, svd_weight=0.6, lightfm_weight=0.4):
    """Rank catalog videos for a user by a weighted blend of SVD and LightFM scores.

    Each model scores every distinct ``video_link`` in the module-level
    ``video_catalog``. The scores of each model are min-max scaled to [0, 1] so the two
    scales are comparable, combined as
    ``svd_weight * svd + lightfm_weight * lightfm``, and the highest-scoring links are
    returned. A model that produced no score for a video contributes 0 for it.

    Args:
        user_id: User to recommend for. Passed to ``svd_model.predict`` unchanged (so it
            must match the type of the raw IDs the SVD model was trained on) and looked
            up as ``str(user_id)`` in the LightFM user mapping.
        n_recommendations: Maximum number of links to return.
        svd_weight: Weight of the normalised SVD score in the blend.
        lightfm_weight: Weight of the normalised LightFM score in the blend.

    Returns:
        A list of at most ``n_recommendations`` unique video links, best first. Ties
        keep catalog order. Empty when ``n_recommendations`` is not positive.
    """
    def _min_max(raw_scores):
        """Scale a {video: score} dict to [0, 1]; identical scores all map to 0."""
        if not raw_scores:
            return {}
        lowest = min(raw_scores.values())
        span = max(raw_scores.values()) - lowest
        if span == 0:
            return {video: 0.0 for video in raw_scores}
        return {video: (score - lowest) / span for video, score in raw_scores.items()}

    if n_recommendations <= 0:
        return []

    # Score each link once even if the catalog lists it on several rows.
    catalog = video_catalog.drop_duplicates(subset='video_link')
    video_links = catalog['video_link'].tolist()

    # SVD estimates; a failing prediction is logged and that video simply gets no SVD score.
    svd_scores = {}
    for video in video_links:
        try:
            svd_scores[video] = svd_model.predict(user_id, video).est
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # LightFM scores. Item positions are resolved through the dataset's own item mapping
    # rather than assuming that catalog row order equals LightFM's internal index.
    lightfm_scores = {}
    user_id_map, _, item_id_map, _ = lfm_dataset.mapping()
    user_index = user_id_map.get(str(user_id))
    if user_index is not None:
        indexed_links = [
            (link, item_id_map[video_id])
            for link, video_id in zip(video_links, catalog['video_id'].tolist())
            if video_id in item_id_map
        ]
        if indexed_links:
            item_indices = np.array([index for _, index in indexed_links], dtype=np.int32)
            raw = lfm_model.predict(user_index, item_indices, item_features=item_features)
            lightfm_scores = {
                link: float(score) for (link, _), score in zip(indexed_links, raw)
            }

    svd_scaled = _min_max(svd_scores)
    lightfm_scaled = _min_max(lightfm_scores)

    # Weighted sum over every video that at least one model managed to score.
    blended = [
        (video,
         svd_weight * svd_scaled.get(video, 0.0) + lightfm_weight * lightfm_scaled.get(video, 0.0))
        for video in video_links
        if video in svd_scaled or video in lightfm_scaled
    ]
    blended.sort(key=lambda pair: pair[1], reverse=True)  # stable: ties keep catalog order
    return [video for video, _ in blended[:n_recommendations]]

# Produce and print recommendations for the representative user of each engagement profile.
for profile in user_profiles:
    user_id = user_profiles[profile]
    logging.info(f"Generating recommendations for {profile} (user {user_id}).")
    recommendations = get_recommendations(user_id, n_recommendations=5)
    # A single print with a newline separator emits the same two lines as two prints.
    print(f"\nRecommendations for {profile} (user {user_id}):", recommendations, sep="\n")

# Get unique user IDs from dataset for input handling
unique_user_ids = user_data['user_id'].unique().tolist()

# input() always returns text, so compare against the string form of every ID. This
# also keeps ', '.join() from failing should the IDs not already be strings.
valid_user_ids = [str(uid) for uid in unique_user_ids]

# Prompt user to enter a valid user ID for recommendations. Surrounding whitespace is
# stripped so that an accidental space does not turn a valid ID into an invalid one.
# NOTE: input() blocks until answered, so this cell needs an interactive session.
user_input = input(f"Enter a user ID from the following options: {', '.join(valid_user_ids)} for recommendations: ").strip()

# Check if entered user ID is in list of unique user IDs
if user_input in valid_user_ids:
    recommendations = get_recommendations(user_input, n_recommendations=5)
    print(f"Recommendations for user {user_input}:", recommendations)
else:
    print("Invalid user ID. Please enter a valid user ID from the list:", unique_user_ids)
