<a href="https://colab.research.google.com/github/khalil649/Big-Data/blob/main/notebooks/05.00-Big-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Big data


Loading dataset

In [11]:
!pip install pymongo

import datetime
import os
import shutil
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from google.colab import files
from pymongo import MongoClient


# Read the Atlas credentials from the environment (MONGODB_USERNAME /
# MONGODB_PASSWORD) and fall back to an interactive prompt, so that no secret
# is stored in the notebook itself.
# NOTE(security): credentials that were previously hard-coded in this file
# should be treated as compromised and rotated in MongoDB Atlas.
# quote_plus percent-encodes characters such as '@' or ':' that would
# otherwise break the connection URI.
username = quote_plus(os.environ.get("MONGODB_USERNAME") or input("MongoDB username: "))
password = quote_plus(os.environ.get("MONGODB_PASSWORD") or getpass("MongoDB password: "))

# MongoDB Atlas SRV connection string
uri = f"mongodb+srv://{username}:{password}@cluster0.oc8pqqj.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Connect to MongoDB Atlas
client = MongoClient(uri)

# Connection test: server_info() has to reach the server, so a connection
# problem surfaces here rather than later during the insert.
print(client.server_info())

# Make sure the local "data" folder exists and report whether it had to be created.
folder_name = "data"
if os.path.exists(folder_name):
    print("📂 Le dossier 'data' existe déjà.")
else:
    os.makedirs(folder_name)
    print("📁 Dossier 'data' créé avec succès.")


# Select and upload the files (disabled: uncomment to upload them from Colab)
#uploaded = files.upload()

# Move the uploaded files into the "data" folder
#for filename in uploaded.keys():
    #shutil.move(filename, os.path.join(folder_name, filename))
    #print(f"✅ Fichier déplacé : {filename} → {folder_name}/{filename}")

# Target database and collection on the Atlas cluster
db = client['movielens']
movies_collection = db['movies']

# Load movies; 'genres' is pipe-separated in the CSV, so split it into a list
movies_df = pd.read_csv('data/movies.csv')
movies_df['genres'] = movies_df['genres'].str.split('|')

# Load ratings and force the rating column to float
ratings_df = pd.read_csv('data/ratings.csv')
ratings_df['rating'] = ratings_df['rating'].astype(float)

# Per-movie summary: mean rating, number of ratings and the raw rating values
ratings_summary = ratings_df.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    total_ratings=('rating', 'count'),
    all_ratings=('rating', list)
).reset_index()

# Left merge keeps movies that nobody rated; their summary columns are NaN
movies_with_ratings = pd.merge(movies_df, ratings_summary, on='movieId', how='left')

# Convert to one dictionary per movie for MongoDB.
# Only the records are normalised (the DataFrame above is left untouched):
# an unrated movie would otherwise be stored with NaN in place of its ratings
# list, and the NaN turns the whole count column into floats.
movies_list = movies_with_ratings.to_dict(orient='records')
for movie in movies_list:
    if pd.isna(movie['total_ratings']):
        movie['avg_rating'] = None
        movie['total_ratings'] = 0
        movie['all_ratings'] = []
    else:
        movie['total_ratings'] = int(movie['total_ratings'])

# Insert data into MongoDB Atlas only when the collection is still empty, so
# re-running this cell does not duplicate every movie. To reload from the CSV
# files, empty or drop the collection first.
if movies_collection.estimated_document_count() == 0:
    movies_collection.insert_many(movies_list)
    print("✅ Data successfully loaded into MongoDB Atlas")
else:
    print("ℹ️ Collection 'movies' already contains documents; skipping insert to avoid duplicates")
# === QUERY 1: Most Watched Movies by Year ===
# Every rating row counts as one "view"; the year comes from the rating
# timestamp, which is parsed as seconds since the epoch.
ratings_df['year'] = pd.to_datetime(ratings_df['timestamp'], unit='s').dt.year
most_watched = (
    ratings_df
    .groupby(['movieId', 'year'])
    .size()
    .reset_index(name='views')
)
# Sort by year, then by views descending, so the first row kept for each year
# is that year's most-viewed movie.
top_movies = most_watched.sort_values(
    by=['year', 'views'], ascending=[True, False]
).drop_duplicates(subset='year', keep='first')
top_movies = pd.merge(top_movies, movies_df[['movieId', 'title']], on='movieId', how='left')
print("\n🎬 Most Watched Movies by Year:\n", top_movies.loc[:, ['year', 'title', 'views']])

# === QUERY 2: Average Rating per Genre ===
# One row per (movie, genre) pair; the genre score is the plain mean of the
# per-movie averages, i.e. it is not weighted by how many ratings a movie has.
movies_with_ratings_exploded = movies_with_ratings.explode(column='genres')
avg_rating_per_genre = (
    movies_with_ratings_exploded
    .groupby(by='genres')['avg_rating']
    .mean()
    .sort_values(ascending=False)
)
print("\n⭐ Average Rating per Genre:\n", avg_rating_per_genre)

# === QUERY 3: Most Active Users ===
# Count rating rows per user and keep the ten users with the most ratings.
user_activity = (
    ratings_df
    .groupby(by='userId')
    .size()
    .reset_index(name='num_ratings')
)
top_users = user_activity.sort_values(by='num_ratings', ascending=False).iloc[:10]
print("\n👤 Most Active Users:\n", top_users)

# === QUERY 4: Similar Movies by Genre ===
def find_similar_movies_by_genre(target_movie_id, top_n=10):
    """Return the best-rated movies sharing at least one genre with a target movie.

    Reads the module-level ``movies_df`` (``movieId``, ``title`` and ``genres``
    as a list of genre names) and ``ratings_summary`` (per-movie
    ``avg_rating``).

    Args:
        target_movie_id: ``movieId`` of the reference movie.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with the columns ``title``, ``genres`` and ``avg_rating``,
        sorted by ``avg_rating`` in descending order. Movies without any
        rating have a NaN ``avg_rating`` and are sorted last.

    Raises:
        IndexError: If ``target_movie_id`` is not present in ``movies_df``
            (same exception type as before, now with an explicit message).
    """
    target_rows = movies_df.loc[movies_df['movieId'] == target_movie_id, 'genres']
    if target_rows.empty:
        raise IndexError(f"movieId {target_movie_id!r} not found in movies_df")
    target_genres = set(target_rows.iloc[0])

    def has_common_genre(genres):
        return not target_genres.isdisjoint(genres)

    others = movies_df[movies_df['movieId'] != target_movie_id]
    # astype(bool) keeps the mask boolean even when there is no other movie
    # (apply on an empty object column does not produce a bool Series).
    shares_genre = others['genres'].apply(has_common_genre).astype(bool)
    similar_movies = pd.merge(others[shares_genre], ratings_summary, on='movieId', how='left')
    ranked = similar_movies.sort_values('avg_rating', ascending=False)
    return ranked.head(top_n)[['title', 'genres', 'avg_rating']]

# Demo call for movieId 1 (labelled "Toy Story" in the message below), default top_n.
similar_to_toy_story = find_similar_movies_by_genre(1)
print("\n🎞️ Similar Movies by Genre (like Toy Story):\n", similar_to_toy_story)

# === QUERY 5: Content-Based Recommendation ===
def content_based_recommendation(movie_id, min_ratings=50, top_n=10):
    """Recommend movies ranked by genre overlap with a target movie, then by rating.

    Reads the module-level ``movies_df`` (``movieId``, ``title`` and ``genres``
    as a list of genre names) and ``ratings_summary`` (per-movie
    ``avg_rating`` and ``total_ratings``).

    Args:
        movie_id: ``movieId`` of the reference movie.
        min_ratings: Minimum number of ratings a candidate must have. The
            bound is inclusive: a movie with exactly ``min_ratings`` ratings
            qualifies. Movies without any rating never qualify.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with the columns ``title``, ``genres`` and ``avg_rating``,
        ordered by number of shared genres (descending), then by
        ``avg_rating`` (descending).

    Raises:
        IndexError: If ``movie_id`` is not present in ``movies_df`` (same
            exception type as before, now with an explicit message).
    """
    target_rows = movies_df.loc[movies_df['movieId'] == movie_id, 'genres']
    if target_rows.empty:
        raise IndexError(f"movieId {movie_id!r} not found in movies_df")
    target_genres = set(target_rows.iloc[0])

    def genre_overlap(genres):
        return len(target_genres.intersection(genres))

    candidates = movies_df[movies_df['movieId'] != movie_id].copy()
    candidates['genre_overlap'] = candidates['genres'].apply(genre_overlap)
    candidates = pd.merge(candidates, ratings_summary, on='movieId', how='left')
    # Inclusive bound, as the parameter name implies; NaN counts (unrated
    # movies) compare as False and are dropped here.
    candidates = candidates[candidates['total_ratings'] >= min_ratings]
    ranked = candidates.sort_values(['genre_overlap', 'avg_rating'], ascending=[False, False])
    return ranked.head(top_n)[['title', 'genres', 'avg_rating']]

# Demo call for movieId 1 (labelled "Toy Story" in the message below), default thresholds.
toy_story_recommendations = content_based_recommendation(1)
print("\n🎯 Content-Based Recommendations (like Toy Story):\n", toy_story_recommendations)

