# Part 1: User-Based Collaborative Filtering

---

## Overview

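This notebook implements Part 1: user-based collaborative filtering. The cells below examine how far a target user's nearest neighbours (found with raw cosine similarity) can be trusted.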

---

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the whole notebook: seaborn's "whitegrid" style and a
# default figure size of (12, 6).
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

---

## Load Data

In [None]:
# Load dataset
# TODO: Add data loading code here

In [None]:
# Section 2, Part 1, Case Study 1, point 10.
# Question 10: can we trust users who rated only the common items more than
# users who rated additional items beyond the common ones?

In [None]:
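# Adjustment to the main script, case_study_1.py.
# NOTE(review): the code below looks like an excerpt from inside main() of that
# script; it relies on names (target_users, results, all_users_ratings) that are
# not defined in this notebook, so this cell is not runnable on its own.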

# 10. Trust: Common items vs More items
# For every target user, split that user's raw neighbours into two groups:
# those who rated nothing outside the target's items ("subset") and those who
# also rated items the target did not ("superset"), then print counts/examples.
    print("\n10. Trust Analysis (Common vs More Items):")

    # This is a discussion point, but the data can supply examples.
    # Question: compare a neighbour with high overlap but few total items against
    # one with low overlap but many total items.
    # Put differently: "rated only the common items" (subset) vs
    # "rated more items than the common items" (superset).
    # Example: User A rated {1,2} and User B rated {1,2,3,4}. If they agree on
    # {1,2}, the similarity is 1.0 (presumably because similarity is computed
    # over the co-rated items only — TODO confirm against the similarity code).
    # A User C who rated only {1,2} and agrees would look identical to B.
    # So we just print some stats for each target user's neighbours.
    
    # Section 10 loop: classify each neighbour of each target user.
    # `results[target_user]['neighbors_raw']` is iterated as (neighbor_id, score)
    # pairs; `all_users_ratings[user]` is used as a mapping keyed by item.
    for target_user in target_users:
        print(f"\n  Analyzing Target User: {target_user}")
        neighbors = results[target_user]['neighbors_raw']
        
        # Each entry: (neighbor_id, score, n_common_items, n_items_rated_by_neighbor)
        subset_neighbors = []
        superset_neighbors = []
        
        target_items = set(all_users_ratings[target_user].keys())
        
        for neighbor_id, score in neighbors:
            neighbor_items = set(all_users_ratings[neighbor_id].keys())
            common = target_items & neighbor_items
            
            # `common` is always contained in `neighbor_items`, so equal sizes
            # mean every item the neighbour rated was also rated by the target
            # (neighbour's items are a subset of the target's, not necessarily
            # a strict one).
            if len(neighbor_items) == len(common):
                subset_neighbors.append((neighbor_id, score, len(common), len(neighbor_items)))
            # Otherwise the neighbour rated at least one item the target did not;
            # this is the only remaining case, so the condition is always true here.
            elif len(neighbor_items) > len(common):
                superset_neighbors.append((neighbor_id, score, len(common), len(neighbor_items)))
                
        print(f"    Subset Neighbors (Rated ONLY common items): {len(subset_neighbors)}")
        if subset_neighbors:
            # First five in the order `neighbors` was iterated.
            # NOTE(review): "Top" is only accurate if neighbors_raw is already
            # sorted by similarity — confirm.
            print(f"    Top 5 Subset: {subset_neighbors[:5]}")
            
        print(f"    Superset Neighbors (Rated MORE than common items): {len(superset_neighbors)}")
        if superset_neighbors:
            # Open question: sort by number of extra items (total - common),
            # descending? For now the first five are shown unsorted.
            print(f"    Top 5 Superset: {superset_neighbors[:5]}")

    # 11. Low ratings but high cosine: for the first target user only, compare
    # the average rating of (up to five) neighbours whose score is ~1.0 with the
    # target's own average rating.
    print("\n11. Low Ratings High Cosine Analysis:")
    # Does any near-perfect neighbour have a low average rating despite the
    # high similarity?
    u_ex = target_users[0]
    # "Perfect" = score within 1e-5 of 1.0 (tolerance for floating-point error).
    perfect_neighbors = [n for n in results[u_ex]['neighbors_raw'] if abs(n[1] - 1.0) < 1e-5]
    for pid, score in perfect_neighbors[:5]:
        # Plain mean over every rating the user has given.
        # NOTE(review): divides by zero if a user has no ratings — presumably
        # impossible for users that appear as neighbours; confirm.
        p_ratings = list(all_users_ratings[pid].values())
        p_avg = sum(p_ratings)/len(p_ratings)
        # The target's average does not depend on `pid`; it is recomputed on
        # every iteration.
        t_ratings = list(all_users_ratings[u_ex].values())
        t_avg = sum(t_ratings)/len(t_ratings)
        print(f"  User {pid}: Avg Rating={p_avg:.2f} vs Target Avg={t_avg:.2f} (Sim={score:.2f})")

    print("\nAnalysis Complete.")


# Check whether any perfect neighbour has a low average rating but high similarity.
# NOTE(review): the code below repeats the section-11 analysis printed just
# above (same statements, same output); confirm whether one copy is a leftover.
    # Is raw cosine scale-independent? Yes: it measures only the direction of
    # the rating vectors, so multiplying one vector by a positive constant does
    # not change it. (It is NOT shift-invariant, because raw ratings are not
    # mean-centred.)
    # Raw cosine of (1,1) and (5,5) -> (1*5 + 1*5) / (sqrt(2)*sqrt(50)) = 10 / (1.414 * 7.07) = 10 / 10 = 1.0
    # So if one user rates everything 1 and another rates everything 5, raw cosine is 1.0.
    # Let's verify this with data.
    # Does any near-perfect neighbour have a low average rating despite the
    # high similarity?
    u_ex = target_users[0]
    # "Perfect" = score within 1e-5 of 1.0 (tolerance for floating-point error).
    perfect_neighbors = [n for n in results[u_ex]['neighbors_raw'] if abs(n[1] - 1.0) < 1e-5]
    for pid, score in perfect_neighbors[:5]:
        # Plain mean over every rating the user has given.
        p_ratings = list(all_users_ratings[pid].values())
        p_avg = sum(p_ratings)/len(p_ratings)
        # The target's average does not depend on `pid`; it is recomputed on
        # every iteration.
        t_ratings = list(all_users_ratings[u_ex].values())
        t_avg = sum(t_ratings)/len(t_ratings)
        print(f"  User {pid}: Avg Rating={p_avg:.2f} vs Target Avg={t_avg:.2f} (Sim={score:.2f})")

    print("\nAnalysis Complete.")


# Script entry point carried over from case_study_1.py: calls main() when the
# module is executed directly.
# NOTE(review): no main() is defined in this cell — presumably it lives in the
# original script; confirm before running this cell.
if __name__ == "__main__":
    main()

In [None]:
# Separate script (created by a model) to analyze q1_10_targeted

import sys
import os
import pandas as pd

# Add path for utils: put the directory two levels above this code on sys.path.
# `__file__` is only defined when this runs as a script; inside a notebook cell
# it raises NameError, so fall back to the current working directory.
# NOTE(review): the fallback assumes the kernel's working directory sits at the
# same depth as the original script (two levels below the folder that contains
# `utils`) — confirm for this notebook's location.
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(_base_dir, '..', '..')))
from utils import data_loader

def get_user_ratings(df, user_id):
    """Return one user's ratings as an ``{item: rating}`` dictionary.

    Args:
        df: DataFrame with ``'user'``, ``'item'`` and ``'rating'`` columns.
        user_id: Value matched against the ``'user'`` column.

    Returns:
        dict mapping each item rated by ``user_id`` to its rating; empty when
        the user has no rows. If an item occurs more than once, the last row
        wins.
    """
    is_this_user = df['user'] == user_id
    rows = df.loc[is_this_user]
    return {item: rating for item, rating in zip(rows['item'], rows['rating'])}

def main():
    """Print the question-10 trust analysis for hard-coded targets and neighbours.

    Loads the preprocessed dataset through ``data_loader``. For each target
    user it then reports, per listed neighbour, how many items the neighbour
    rated, how many of those the target also rated, and whether the neighbour
    rated nothing outside the target's items ("subset") or rated additional
    items ("superset").
    """
    print("Loading data...")
    ratings_df = data_loader.get_preprocessed_dataset()

    # Target user -> top raw-cosine neighbours, taken from
    # output_case_study_1.txt (Step 2); these are the neighbours the "trust"
    # question is about.
    top_neighbors_by_target = {
        134471: [131078, 131084, 98491, 223, 66017],
        27768: [8629, 16611, 20630, 217, 41956],
        16157: [5285, 49306, 17495, 50194, 49260],
    }

    print("\n10. Trust Analysis (Common vs More Items) for Top Neighbors:")

    for target_id in top_neighbors_by_target:
        print(f"\nTarget User: {target_id}")
        target_items = set(get_user_ratings(ratings_df, target_id))
        print(f"  Target Items Count: {len(target_items)}")

        for neighbor_id in top_neighbors_by_target[target_id]:
            neighbor_items = set(get_user_ratings(ratings_df, neighbor_id))

            shared_items = neighbor_items & target_items
            # Items the neighbour rated that the target did not; none left
            # over means the neighbour rated only common items.
            extra_items = neighbor_items - target_items
            rated_only_common = not extra_items

            print(f"  Neighbor {neighbor_id}:")
            print(f"    Total Items Rated: {len(neighbor_items)}")
            print(f"    Common Items: {len(shared_items)}")
            print(f"    Is Subset (Rated ONLY common)? {rated_only_common}")

            if not rated_only_common:
                print(f"    -> SUPERSET NEIGHBOR (Rated {len(extra_items)} additional items)")
                continue
            print("    -> SUBSET NEIGHBOR (Trust Issue: High similarity based on scant evidence)")

# Run the analysis when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is also "__main__").
if __name__ == "__main__":
    main()
