In [None]:
import pandas as pd

In [None]:
# Location of the raw user-interaction dataset (JSON) in the GitHub repository
url = (
    "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/"
    "refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
)

# Pull the JSON straight from the URL into a DataFrame
data = pd.read_json(path_or_buf=url)

# Print the leading rows as a quick sanity check
print(data.head(n=5))


   user_id country         city state  age  \
0        1     USA  Los Angeles    CA   30   
1        2     USA     New York    NY   30   
2        3     USA  Los Angeles    CA   18   
3        4  Canada      Toronto    ON   40   
4        5  Canada    Vancouver    BC   18   

                                    video_link      genre  watched  liked  \
0  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3  False   
1  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      0.5  False   
2  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      0.3   True   
3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical      0.1  False   
4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop      0.1  False   

   skipped  
0     True  
1     True  
2     True  
3     True  
4     True  


In [None]:
# Location of the video catalog (JSON) in the GitHub repository
url = (
    "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/"
    "refs/heads/main/dataset/Video_catalog.json"
)

# Pull the catalog into a DataFrame
video_catalog = pd.read_json(path_or_buf=url)

# Print the leading rows as a quick sanity check
print(video_catalog.head(n=5))


   video_id                                   video_link      genre  country  \
0         1  https://www.youtube.com/watch?v=D4vN_5MBEog    hip hop      USA   
1         2  https://www.youtube.com/watch?v=7iqMNnzQPmY     ballet      USA   
2         3  https://www.youtube.com/watch?v=p0VGHuaICyI  classical   Canada   
3         4   https://www.youtube.com/shorts/fv5vCREiBMQ      k pop   Canada   
4         5   https://www.youtube.com/shorts/kF0MRowRcIM    African  Nigeria   

          city age_group  
0  Los Angeles     18-35  
1     New York     18-35  
2      Toronto     35-50  
3    Vancouver     18-25  
4         Kano     35-50  


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset as LightFMDataset
from lightfm.evaluation import precision_at_k
import numpy as np
import logging
from datetime import datetime

# Configure logging. The level must be INFO (not ERROR): this notebook reports
# its model-quality metrics (SVD RMSE, LightFM Precision@5) via logging.info(),
# and at ERROR level those messages would be silently discarded. Errors are
# still recorded in the same file.
logging.basicConfig(filename="recommendation_system.log", level=logging.INFO)

# Dataset locations, kept as named constants so they are easy to swap
USER_DATA_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/refs/heads/main/dataset/RS_Fakedata-7-35_users.json"
VIDEO_CATALOG_URL = "https://raw.githubusercontent.com/javeriaz15/Video-Recommendation/main/dataset/Video_catalog.json"

# Load data with error handling and retry mechanism
def load_data(url, retries=3, retry_delay=1.0):
    """Read a JSON resource into a DataFrame, retrying when the read fails.

    Args:
        url: Path or URL handed to ``pd.read_json``.
        retries: Maximum number of attempts; values below 1 are treated as 1.
        retry_delay: Seconds to pause between attempts (0 disables the pause).

    Returns:
        The loaded DataFrame, or an empty DataFrame if every attempt failed.
        Each failed attempt is written to the log at ERROR level.
    """
    import time  # local import: only needed for the pause between attempts

    attempts = max(1, int(retries))
    for attempt in range(1, attempts + 1):
        try:
            return pd.read_json(url)
        except Exception as e:
            logging.error(f"Error loading data from {url}: {e}")
            # Only wait if another attempt is actually going to happen
            if attempt < attempts and retry_delay > 0:
                time.sleep(retry_delay)
    return pd.DataFrame()  # Return an empty DataFrame if loading fails

user_data = load_data(USER_DATA_URL)
video_catalog = load_data(VIDEO_CATALOG_URL)

# load_data() returns an empty frame when loading fails; stop here with a clear
# message instead of failing later with an unrelated KeyError.
if user_data.empty or video_catalog.empty:
    raise RuntimeError(
        "Could not load the user data and/or the video catalog; "
        "see recommendation_system.log for details."
    )

# Recommendation model weights
SVD_WEIGHT = 0.6
LIGHTFM_WEIGHT = 0.4

# Fixed seed so the train/test split, the SVD fit and the reported RMSE are repeatable
RANDOM_STATE = 42

# User IDs are handled as strings throughout (the ID typed at the prompt is a
# string). The cast must happen BEFORE the SVD model is trained: otherwise the
# model only knows the raw (non-string) IDs and every later
# svd_model.predict(<str id>, ...) is treated as an unknown user.
user_data['user_id'] = user_data['user_id'].astype(str)

# Collaborative filtering with Surprise SVD model ('liked' is used as a 0/1 rating)
reader = Reader(rating_scale=(0, 1))
surprise_data = Dataset.load_from_df(user_data[['user_id', 'video_link', 'liked']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=RANDOM_STATE)
svd_model = SVD(random_state=RANDOM_STATE)
svd_model.fit(trainset)

# Predict and calculate RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)
logging.info(f"SVD Model RMSE: {rmse}")

# Encode video features for LightFM model: one integer id per distinct link
video_catalog['video_id'] = video_catalog['video_link'].factorize()[0]
# De-duplicate links before building the lookup: Series.map() rejects a
# non-unique index, and duplicate links share the same factorized id anyway.
link_to_video_id = video_catalog.drop_duplicates('video_link').set_index('video_link')['video_id']
user_data['video_id'] = user_data['video_link'].map(link_to_video_id)

# Initialize LightFM dataset and build combined item features
lfm_dataset = LightFMDataset()
lfm_dataset.fit(users=(x for x in user_data['user_id'].unique()),
                items=(x for x in video_catalog['video_id'].unique()),
                item_features=(x for x in video_catalog['genre']))

# Build the (video_id, [feature strings]) pairs consumed by LightFM
def extract_item_features(df, feature_cols):
    """Pair every row's ``video_id`` with its stringified feature values.

    Args:
        df: DataFrame holding a ``video_id`` column plus the feature columns.
        feature_cols: Names of the columns whose values become features.

    Returns:
        A list of ``(video_id, [str, ...])`` tuples, one per row of ``df``.
    """
    def _row_as_strings(row):
        return [str(value) for value in row]

    feature_lists = df[feature_cols].apply(_row_as_strings, axis=1)
    return [(video_id, features) for video_id, features in zip(df['video_id'], feature_lists)]

# Catalog columns used as LightFM item features. They must be columns that
# actually exist in video_catalog: the catalog has 'country' and 'age_group',
# not the user-level 'age' / 'state' columns.
feature_columns = ['genre', 'country', 'city', 'age_group']
item_feature_rows = extract_item_features(video_catalog, feature_columns)

# Every feature value has to be registered with the dataset before
# build_item_features() can reference it; only the genres were registered by
# the initial fit(), so add the remaining values here.
lfm_dataset.fit_partial(
    item_features=(feature for _, features in item_feature_rows for feature in features)
)
item_features = lfm_dataset.build_item_features(item_feature_rows)

# LightFM model training with reduced epochs
lfm_model = LightFM(loss='warp')
# Interactions whose video is missing from the catalog have no video_id (NaN)
# and cannot be mapped to a LightFM item, so they are left out.
known_interactions = user_data.dropna(subset=['video_id'])
(interactions, weights) = lfm_dataset.build_interactions(
    zip(known_interactions['user_id'].astype(str), known_interactions['video_id'].astype(int))
)
lfm_model.fit(interactions, item_features=item_features, epochs=15, num_threads=4)

# Precision@5 for LightFM.
# NOTE: this is measured on the same interactions the model was trained on, so
# it is an optimistic figure rather than a held-out estimate.
precision = precision_at_k(lfm_model, interactions, item_features=item_features, k=5).mean()
logging.info(f"LightFM Precision@5: {precision}")

# Time decay function to adjust ratings
def time_decay(timestamp, decay_rate=0.001, now=None):
    """Return an exponential decay weight based on the age of ``timestamp``.

    Args:
        timestamp: Either a ``"%Y-%m-%dT%H:%M:%S"`` string or a ``datetime``
            (which includes ``pandas.Timestamp``).
        decay_rate: Decay applied per whole day of age.
        now: Reference time used to compute the age. Defaults to the current
            time (in the timestamp's timezone when it has one).

    Returns:
        ``exp(-decay_rate * days_ago)``. Timestamps later than ``now`` are
        treated as zero days old, so the weight never exceeds 1 for a
        non-negative ``decay_rate``.
    """
    if isinstance(timestamp, datetime):
        event_time = timestamp
    else:
        event_time = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")

    if now is None:
        # Match the timestamp's awareness so the subtraction below is valid
        now = datetime.now(event_time.tzinfo)

    # Clamp: a timestamp in the future must not amplify the rating
    days_ago = max((now - event_time).days, 0)
    return np.exp(-decay_rate * days_ago)

# Apply time decay to liked ratings.
# The interaction data may not carry a 'timestamp' column (the preview of the
# dataset loaded above shows none); in that case no decay can be computed and
# the plain liked flag is used as the weight.
if 'timestamp' in user_data.columns:
    user_data['decayed_liked'] = [
        liked * time_decay(timestamp) if liked else 0
        for liked, timestamp in zip(user_data['liked'], user_data['timestamp'])
    ]
else:
    user_data['decayed_liked'] = user_data['liked'].astype(float)

# Calculate engagement thresholds
# Median number of interactions per user
MIN_INTERACTIONS_FOR_ACTIVE = int(user_data.groupby('user_id').size().quantile(0.5))
# Upper and lower quartiles of the 'watched' values
HIGH_WATCH_THRESHOLD = user_data['watched'].quantile(0.75)
LOW_WATCH_THRESHOLD = user_data['watched'].quantile(0.25)

# Engagement classifier: per-user metrics, keyed by user_id, computed once.
# Named aggregation yields the same {user_id: {metric: value}} mapping without
# running a Python lambda per group.
user_interactions_cache = (
    user_data.groupby('user_id')
    .agg(
        total_interactions=('watched', 'size'),
        avg_watch=('watched', 'mean'),
        total_skipped=('skipped', 'sum'),
    )
    .to_dict(orient='index')
)

def classify_user_engagement(user_id):
    """Label a user as new, low-engagement or active from the cached metrics.

    Args:
        user_id: Key to look up in ``user_interactions_cache``; a user with no
            cached entry is treated as having zero interactions.

    Returns:
        ``"new_user"`` when the user has fewer interactions than
        ``MIN_INTERACTIONS_FOR_ACTIVE``; ``"low_engagement_user"`` when the
        average watch value is below ``LOW_WATCH_THRESHOLD`` and at least half
        of the interactions were skipped; ``"active_user"`` otherwise.
    """
    metrics = user_interactions_cache.get(user_id, {})
    interaction_count = metrics.get('total_interactions', 0)
    mean_watch = metrics.get('avg_watch', 0)
    skip_count = metrics.get('total_skipped', 0)

    # Guard clause: too little history to judge engagement
    if interaction_count < MIN_INTERACTIONS_FOR_ACTIVE:
        return "new_user"

    mostly_skipped = skip_count >= interaction_count / 2
    if mean_watch < LOW_WATCH_THRESHOLD and mostly_skipped:
        return "low_engagement_user"
    return "active_user"

# Engagement class of every user, keyed by user_id. (Keying by the class label
# instead would keep only one user per class, since later users overwrite
# earlier ones.)
user_profiles = {user_id: classify_user_engagement(user_id) for user_id in user_data['user_id'].unique()}

# Generate recommendations
def get_recommendations(user_id, n_recommendations=5, svd_weight=SVD_WEIGHT, lightfm_weight=LIGHTFM_WEIGHT):
    """Recommend videos the user has not interacted with yet.

    Each candidate video receives a weighted sum of the SVD rating estimate
    and the LightFM score; the highest-scoring links are returned.

    Args:
        user_id: User to recommend for; compared as a string because
            ``user_data['user_id']`` is stored as strings.
        n_recommendations: Maximum number of links to return.
        svd_weight: Weight applied to the SVD estimate.
        lightfm_weight: Weight applied to the LightFM score.

    Returns:
        A list of at most ``n_recommendations`` video links, best first.
    """
    user_id = str(user_id)

    # Candidates: catalog videos with no interaction from this user, one row per link
    watched_links = set(user_data.loc[user_data['user_id'] == user_id, 'video_link'])
    unwatched_videos = video_catalog[~video_catalog['video_link'].isin(watched_links)]
    unwatched_videos = unwatched_videos.drop_duplicates('video_link')

    combined_scores = {}
    for video in unwatched_videos['video_link']:
        try:
            est_rating = svd_model.predict(user_id, video).est
            combined_scores[video] = combined_scores.get(video, 0) + est_rating * svd_weight
        except Exception as e:
            logging.error(f"Error predicting rating for user {user_id} and video {video}: {e}")

    # NOTE(review): SVD estimates and LightFM scores are presumably on different
    # scales, so the weights are not true proportions; consider normalising the
    # LightFM scores before blending.
    user_id_map, _, item_id_map, _ = lfm_dataset.mapping()
    user_index = user_id_map.get(user_id)
    if user_index is not None:
        # Score only the unwatched candidates, and translate each catalog
        # video_id into LightFM's internal item index rather than assuming it
        # equals the row position in video_catalog.
        scorable = [
            (link, item_id_map[video_id])
            for link, video_id in zip(unwatched_videos['video_link'], unwatched_videos['video_id'])
            if video_id in item_id_map
        ]
        if scorable:
            links, item_indices = zip(*scorable)
            scores = lfm_model.predict(
                user_index, np.asarray(item_indices, dtype=np.int32), item_features=item_features
            )
            for video, score in zip(links, scores):
                combined_scores[video] = combined_scores.get(video, 0) + score * lightfm_weight

    recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return [rec[0] for rec in recommendations]

# IDs are offered and compared as strings, because input() always returns a string
unique_user_ids = user_data['user_id'].astype(str).unique().tolist()
# Strip surrounding whitespace so an otherwise valid ID is not rejected
user_input = input(f"Enter a user ID from the following options: {', '.join(unique_user_ids)} for recommendations: ").strip()

if user_input in unique_user_ids:
    recommendations = get_recommendations(user_input, n_recommendations=5)
    print(f"Recommendations for user {user_input}:", recommendations)
else:
    print("Invalid user ID. Please enter a valid user ID from the list:", unique_user_ids)

