In [None]:
import pandas as pd

In [None]:
# Raw GitHub location of the user-interaction dataset (JSON)
url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"

# Download the JSON and parse it into a DataFrame
data = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows of the frame
print(data.head(n=5))


   user_id country         city state  age  \
0        1     USA  Los Angeles    CA   30   
1        2     USA     New York    NY   30   
2        3     USA  Los Angeles    CA   18   
3        4  Canada      Toronto    ON   40   
4        5  Canada    Vancouver    BC   18   

                                    video_link      genre  watched  liked  \
0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   

   skipped  
0     True  
1     True  
2     True  
3     True  
4     True  


In [None]:
# Raw GitHub location of the video catalog (JSON)
url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/Video_catalog.json"

# Download the catalog and parse it into a DataFrame
video_catalog = pd.read_json(path_or_buf=url)

# Sanity check: print the leading rows of the catalog
print(video_catalog.head(n=5))


   video_id                                   video_link      genre  country  \
0         1  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      USA   
1         2  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      USA   
2         3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical   Canada   
3         4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop   Canada   
4         5   https://www.youtube.com/shorts/kF0MRowRcIM    African  Nigeria   

          city age_group  
0  Los Angeles     18-35  
1     New York     18-35  
2      Toronto     35-50  
3    Vancouver     18-25  
4         Kano     35-50  


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
from datetime import datetime

# Load the user interaction data and video catalog
user_data_url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
video_catalog_url = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/main/dataset/Video_catalog.json"
user_data = pd.read_json(user_data_url)
video_catalog = pd.read_json(video_catalog_url)

# Configure logging to save error messages in a file
logging.basicConfig(filename="svd_predictions.log", level=logging.ERROR)

# Fixed seed so the train/test split and both models are repeatable on
# Restart & Run All (LightFM with num_threads > 1 may still vary slightly).
RANDOM_SEED = 42

# Use string user IDs for BOTH models. Casting before the Surprise dataset is
# built matters: Surprise looks users up by the exact raw ID it was trained
# on, so training on ints and predicting with "1" would silently return the
# default estimate for every user.
user_data['user_id'] = user_data['user_id'].astype(str)

# Set up Surprise model for collaborative filtering (SVD-based)
reader = Reader(rating_scale=(0, 1))  # 'liked' is boolean, i.e. a 0/1 rating
surprise_data = Dataset.load_from_df(user_data[['user_id', 'video_link', 'liked']], reader)

# Split the dataset into training and testing sets for SVD
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=RANDOM_SEED)
svd_model = SVD(random_state=RANDOM_SEED)
svd_model.fit(trainset)

# Predict on the test set and calculate RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"SVD Model RMSE: {rmse}")

# Give every distinct catalog link a dense integer id and attach it to the
# interactions. Duplicate links are dropped first because set_index().map()
# requires a unique index.
video_catalog = video_catalog.drop_duplicates(subset='video_link').reset_index(drop=True)
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
user_data['video_id'] = user_data['video_link'].map(video_catalog.set_index('video_link')['video_id'])
user_ids = user_data['user_id'].unique()
video_ids = video_catalog['video_id'].unique()

# Item features come from whichever descriptive columns the catalog actually
# has. The catalog preview shows genre / country / city / age_group (there is
# no 'age' or 'state' column), so only existing columns are used.
item_feature_columns = [
    column
    for column in ('genre', 'age', 'age_group', 'city', 'state', 'country')
    if column in video_catalog.columns
]
item_feature_rows = [
    (video_id, [str(value) for value in values if pd.notna(value)])
    for video_id, *values in video_catalog[['video_id', *item_feature_columns]].itertuples(index=False, name=None)
]
item_feature_names = sorted({name for _, names in item_feature_rows for name in names})

# Prepare LightFM data. Every feature token must be registered in fit();
# build_item_features() rejects tokens it has not seen.
lfm_dataset = LightFMDataset()
lfm_dataset.fit(users=user_ids,
                items=video_ids,
                item_features=item_feature_names)
item_features = lfm_dataset.build_item_features(item_feature_rows)

# Train the LightFM model on interactions whose video exists in the catalog
# (rows without a catalog match have a NaN video_id and cannot be mapped).
# NOTE(review): every interaction row counts as a positive signal here,
# including skipped/unliked ones -- confirm that this is intended.
lfm_model = LightFM(loss='warp', random_state=RANDOM_SEED)
catalog_interactions = user_data.dropna(subset=['video_id'])
(interactions, weights) = lfm_dataset.build_interactions(
    zip(catalog_interactions['user_id'], catalog_interactions['video_id'].astype(int))
)
lfm_model.fit(interactions, item_features=item_features, epochs=30, num_threads=2)

# Calculate Precision@5 for LightFM. This is measured on the same interactions
# the model was trained on, so it is an optimistic (training-set) figure.
precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
print(f"LightFM Precision@5: {precision}")

# Time decay function for adjusting ratings
def time_decay(timestamp, decay_rate=0.001):
    """Return an exponential decay weight for an interaction timestamp.

    Args:
        timestamp: When the interaction happened. Either a ``datetime``
            (``pandas.Timestamp`` is a ``datetime`` subclass, so it is
            accepted too) or a string in ``"%Y-%m-%dT%H:%M:%S"`` format;
            other ISO-8601 strings are parsed with
            ``datetime.fromisoformat`` as a fallback.
        decay_rate: Decay per whole day elapsed. ``0`` disables decay.

    Returns:
        ``exp(-decay_rate * days_ago)`` as a NumPy float, where ``days_ago``
        is the number of whole days between ``timestamp`` and now.

    Raises:
        ValueError: If a string timestamp cannot be parsed.
        TypeError: If ``timestamp`` is neither a string nor a datetime.
    """
    if isinstance(timestamp, datetime):
        moment = timestamp
    else:
        try:
            moment = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            # e.g. fractional seconds or an explicit UTC offset
            moment = datetime.fromisoformat(timestamp)

    # Aware and naive datetimes cannot be subtracted, so match "now" to the input.
    now = datetime.now(moment.tzinfo) if moment.tzinfo is not None else datetime.now()

    # Clamp at zero: a timestamp in the future must not yield a weight above 1.
    days_ago = max((now - moment).days, 0)
    return np.exp(-decay_rate * days_ago)

# Apply time decay to liked ratings.
# A liked row is worth 1 * decay, an unliked row 0. The dataset preview shows
# no 'timestamp' column, so the decay is applied only when one is present;
# otherwise the undecayed like value is kept instead of raising KeyError.
liked_numeric = user_data['liked'].astype(float)
if 'timestamp' in user_data.columns:
    # read_json may already have parsed the column into datetimes; normalise
    # to the "%Y-%m-%dT%H:%M:%S" text form that time_decay() parses.
    timestamp_text = pd.to_datetime(user_data['timestamp'], errors='coerce').dt.strftime("%Y-%m-%dT%H:%M:%S")
    # Rows with a missing/unparseable timestamp are left undecayed (factor 1.0).
    decay_factor = timestamp_text.map(lambda ts: time_decay(ts) if isinstance(ts, str) else 1.0)
    user_data['decayed_liked'] = liked_numeric * decay_factor
else:
    user_data['decayed_liked'] = liked_numeric

# Helpers and entry point for blending SVD and LightFM scores into recommendations
def _resolve_svd_user_id(user_id):
    """Return the form of ``user_id`` (as given, str, or int) known to the SVD trainset."""
    candidates = [user_id, str(user_id)]
    try:
        candidates.append(int(user_id))
    except (TypeError, ValueError):
        pass  # non-numeric IDs simply have no integer form
    for candidate in candidates:
        try:
            svd_model.trainset.to_inner_uid(candidate)
        except ValueError:
            continue  # this form was not seen during training
        return candidate
    return user_id  # unknown user: Surprise falls back to its default estimate


def _min_max_scale(scores):
    """Rescale scores linearly to [0, 1]; a constant or empty vector maps to zeros."""
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    low, high = scores.min(), scores.max()
    if high == low:
        return np.zeros_like(scores)
    return (scores - low) / (high - low)


def _catalog_feature_matrix(catalog, columns, known_features):
    """Build a LightFM item-feature matrix using only the given catalog columns."""
    present = [column for column in columns if column in catalog.columns]
    rows = []
    for video_id, *values in catalog[['video_id', *present]].itertuples(index=False, name=None):
        # Tokens the dataset was not fitted with would make LightFM raise, so drop them.
        tokens = [str(value) for value in values
                  if pd.notna(value) and str(value) in known_features]
        rows.append((video_id, tokens))
    return lfm_dataset.build_item_features(rows)


def get_recommendations(user_id, n_recommendations=5, svd_weight=0.6, lightfm_weight=0.4):
    """Recommend catalog videos for a user by blending SVD and LightFM scores.

    Each catalog video receives
    ``svd_weight * svd_estimate + lightfm_weight * lightfm_score`` where
    ``svd_estimate`` is the Surprise SVD prediction and ``lightfm_score`` is
    the mean of three min-max scaled LightFM predictions: one using the full
    item-feature matrix, one using only age features, and one using only
    location features. LightFM scores are rescaled to [0, 1] because their raw
    range is arbitrary and would otherwise swamp the SVD estimate.

    Args:
        user_id: Raw user identifier (string or integer form).
        n_recommendations: Maximum number of links to return.
        svd_weight: Weight of the SVD estimate in the blend.
        lightfm_weight: Weight of the LightFM score in the blend.

    Returns:
        A list of at most ``n_recommendations`` distinct video links, best
        first. Users unknown to LightFM are ranked on the SVD part alone.
    """
    if n_recommendations <= 0:
        return []

    catalog = video_catalog.drop_duplicates(subset='video_link')
    links = catalog['video_link'].tolist()
    if not links:
        return []

    # SVD-based scores (interaction history)
    svd_user_id = _resolve_svd_user_id(user_id)
    svd_scores = np.zeros(len(links))
    for position, video in enumerate(links):
        try:
            svd_scores[position] = svd_model.predict(svd_user_id, video).est
        except Exception as e:
            # Best effort: a failed prediction keeps score 0 and is logged.
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # LightFM-based scores (content features); only for users the model knows
    lightfm_scores = np.zeros(len(links))
    user_map, _, item_map, feature_map = lfm_dataset.mapping()
    user_index = user_map.get(str(user_id))
    if user_index is not None:
        known_mask = catalog['video_id'].isin(list(item_map)).to_numpy()
        known_catalog = catalog[known_mask]
        # Translate catalog ids to LightFM's internal item indices explicitly
        # rather than assuming row position == internal index.
        item_indices = np.array([item_map[video_id] for video_id in known_catalog['video_id']],
                                dtype=np.int32)
        if item_indices.size:
            feature_views = [
                item_features,
                _catalog_feature_matrix(known_catalog, ('age', 'age_group'), feature_map),
                _catalog_feature_matrix(known_catalog, ('city', 'state', 'country'), feature_map),
            ]
            view_scores = [
                _min_max_scale(lfm_model.predict(user_index, item_indices, item_features=view))
                for view in feature_views
            ]
            lightfm_scores[known_mask] = np.mean(view_scores, axis=0)

    # Final blended ranking, limited to the top N
    blended = svd_weight * svd_scores + lightfm_weight * lightfm_scores
    ranking = np.argsort(-blended, kind='stable')
    return [links[position] for position in ranking[:n_recommendations]]

# Sample user IDs, one per profile label, used to smoke-test the recommender
user_profiles = dict(
    active_user="1",          # example ID for an active user
    new_user="2",             # example ID for a new user with limited interactions
    low_engagement_user="3",  # example ID with low watch or like rates
)

# Print the recommendation list for every sample profile
for profile, user_id in user_profiles.items():
    print(f"\nRecommendations for {profile}:")
    recommendations = get_recommendations(user_id, n_recommendations=5)
    print(recommendations)

# Interactive example: recommend for a user ID typed in by the reader
user_input = input("Enter a user ID between 1 and 5 for recommendations: ")
if user_input not in {"1", "2", "3", "4", "5"}:
    print("Invalid user ID. Please enter a number between 1 and 5.")
else:
    new_user_id = user_input
    recommendations = get_recommendations(new_user_id, n_recommendations=5)
    print(f"Recommendations for user {new_user_id}:", recommendations)
