<a href="https://colab.research.google.com/github/laylamoguibmm/Tasks/blob/main/task5_elevvopaths.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ===============================
# 1. IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from collections import defaultdict

# ===============================
# 2. LOAD MOVIELENS DATASET
# ===============================
ratings_file = "/content/u.data"
movies_file = "/content/u.item"

# Load ratings
ratings_columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_file, sep='\t', names=ratings_columns, encoding='latin-1')

# Load movie titles
movies_columns = ['movieId', 'title', 'release_date', 'video_release', 'IMDb_url', 'unknown', 'Action', 'Adventure', 'Animation',
                  'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                  'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv(movies_file, sep='|', names=movies_columns, encoding='latin-1')

# Merge ratings and movies
data = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')
print("Dataset Shape:", data.shape)
print(data.head())

# ===============================
# 3. CREATE USER-ITEM MATRIX
# ===============================
user_item_matrix = data.pivot_table(index='userId', columns='title', values='rating')
print("User-Item Matrix Shape:", user_item_matrix.shape)

# ===============================
# 4. ITEM-BASED COLLABORATIVE FILTERING
# ===============================
# Compute cosine similarity between items (movies)
item_similarity = cosine_similarity(user_item_matrix.T.fillna(0))
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

def recommend_similar_movies(movie_name, top_n=10):
    if movie_name not in item_similarity_df:
        return "Movie not found!"
    similar_scores = item_similarity_df[movie_name].sort_values(ascending=False)
    return similar_scores.iloc[1:top_n+1].index.tolist()

print("Similar movies to 'Toy Story (1995)':")
print(recommend_similar_movies("Toy Story (1995)", top_n=5))

# ===============================
# 5. USER-BASED RECOMMENDATION (TOP UNSEEN MOVIES)
# ===============================
user_similarity = cosine_similarity(user_item_matrix.fillna(0))
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

def recommend_for_user(user_id, top_n=10):
    if user_id not in user_similarity_df:
        return "User not found!"
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).iloc[1:6].index
    # Collect movies rated by similar users
    similar_users_ratings = user_item_matrix.loc[similar_users].mean().sort_values(ascending=False)
    # Filter out movies the user has already rated
    movies_rated = user_item_matrix.loc[user_id].dropna().index
    recommendations = similar_users_ratings.drop(movies_rated).head(top_n)
    return recommendations.index.tolist()

print("Recommended movies for User 1:")
print(recommend_for_user(1, top_n=5))

# ===============================
# 6. MATRIX FACTORIZATION (SVD)
# ===============================
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.2)

svd = SVD()
svd.fit(trainset)
predictions = svd.test(testset)

# ===============================
# 7. EVALUATE PRECISION@K
# ===============================
def precision_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]
        precisions[uid] = sum((true_r >= threshold) for (_, true_r) in top_k) / k

    return sum(prec for prec in precisions.values()) / len(precisions)

print("Precision@10:", precision_at_k(predictions, k=10))


Dataset Shape: (100000, 5)
   userId  movieId  rating  timestamp                       title
0     196      242       3  881250949                Kolya (1996)
1     186      302       3  891717742    L.A. Confidential (1997)
2      22      377       1  878887116         Heavyweights (1994)
3     244       51       2  880606923  Legends of the Fall (1994)
4     166      346       1  886397596         Jackie Brown (1997)
User-Item Matrix Shape: (943, 1664)
Similar movies to 'Toy Story (1995)':
['Star Wars (1977)', 'Return of the Jedi (1983)', 'Independence Day (ID4) (1996)', 'Rock, The (1996)', 'Mission: Impossible (1996)']
Recommended movies for User 1:
['Secrets & Lies (1996)', 'Alien: Resurrection (1997)', 'Hamlet (1996)', 'Shadowlands (1993)', 'Some Folks Call It a Sling Blade (1993)']
Precision@10: 0.5769883351007423
