# Importation des bibliothèques et des données

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from skimpy import skim

In [14]:

#Création d'un dataframe par csv
genome_scores = pd.read_csv('input_data\genome_scores.csv')
genome_tags = pd.read_csv('input_data\genome_tags.csv')
link = pd.read_csv('input_data\link.csv')
movies = pd.read_csv('input_data\movie.csv')
ratings = pd.read_csv('input_data\\rating.csv')
tag = pd.read_csv('input_data\\tag.csv')


In [15]:
# Afficher les colonnes de chaque DataFrame
print("Colonnes de df_genome_scores:", genome_scores.columns.tolist())
print("Colonnes de df_genome_tags:", genome_tags.columns.tolist())
print("Colonnes de df_link:", link.columns.tolist())
print("Colonnes de df_movies:", movies.columns.tolist())
print("Colonnes de df_rating:", ratings.columns.tolist())
print("Colonnes de df_tag:", tag.columns.tolist())

Colonnes de df_genome_scores: ['movieId', 'tagId', 'relevance']
Colonnes de df_genome_tags: ['tagId', 'tag']
Colonnes de df_link: ['movieId', 'imdbId', 'tmdbId']
Colonnes de df_movies: ['movieId', 'title', 'genres']
Colonnes de df_rating: ['userId', 'movieId', 'rating', 'timestamp']
Colonnes de df_tag: ['userId', 'movieId', 'tag', 'timestamp']


In [16]:
skim(movies)

In [17]:
skim(ratings)

In [18]:
skim(genome_scores)

In [19]:
skim(genome_tags)

In [20]:
skim(link)

In [21]:
# Fusionner les évaluations et les films
data = pd.merge(ratings, movies, on='movieId')

In [22]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00,Up (2009),Adventure|Animation|Children|Drama
20000259,138493,69526,4.5,2009-12-03 18:31:48,Transformers: Revenge of the Fallen (2009),Action|Adventure|Sci-Fi|IMAX
20000260,138493,69644,3.0,2009-12-07 18:10:57,Ice Age: Dawn of the Dinosaurs (2009),Action|Adventure|Animation|Children|Comedy|Rom...
20000261,138493,70286,5.0,2009-11-13 15:42:24,District 9 (2009),Mystery|Sci-Fi|Thriller


In [23]:
data["movieId"].isna().sum()

0

# 1. Filtrage collaboratif simple

In [6]:
# Calculer la moyenne des évaluations pour chaque film
average_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()

# Ajouter les titres des films
average_ratings = pd.merge(average_ratings, movies, on='movieId')

# Recommander les meilleurs films
top_movies = average_ratings.sort_values(by='rating', ascending=False).head(10)
print(top_movies[['title', 'rating']])

                                                title  rating
19152               Barchester Chronicles, The (1982)     5.0
21842                            Only Daughter (2013)     5.0
17703                            Boys (Drenge) (1977)     5.0
21656                       Linotype: The Film (2012)     5.0
21658                             Rocaterrania (2009)     5.0
17556             Summer Wishes, Winter Dreams (1973)     5.0
25936                              Small Roads (2011)     5.0
21762  Year Zero: The Silent Death of Cambodia (1979)     5.0
21763                        Stealing a Nation (2004)     5.0
21840                                   B-Side (2013)     5.0


# 2. Filtrage basé sur la similarité des films

### Vérification des données

In [7]:
# Vérifier les valeurs manquantes
print(data.isnull().sum())

# Vérifier les types de données
print(data.dtypes)

# Vérifier le nombre unique d'utilisateurs et de films
print(f"Nombre d'utilisateurs uniques: {data['userId'].nunique()}")
print(f"Nombre de films uniques: {data['movieId'].nunique()}")

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64
userId         int64
movieId        int64
rating       float64
timestamp     object
title         object
genres        object
dtype: object
Nombre d'utilisateurs uniques: 138493
Nombre de films uniques: 26744


### Inspection du DataFrame

In [26]:
# Afficher un échantillon des données
#print(data.head())

# Vérifier les doublons
print(data.duplicated().sum())

0


### Essai sur un sous ensemble 

In [27]:
# Utiliser un petit échantillon de données pour tester
sample_data = data.head(1000)  # Ou une autre taille d'échantillon

# Créer une matrice utilisateur-film avec l'échantillon
try:
    user_movie_matrix = sample_data.pivot_table(index='userId', columns='title', values='rating')
    user_movie_matrix.fillna(0, inplace=True)
    print("Matrice utilisateur-film créée avec succès.")
except Exception as e:
    print(f"Erreur lors de la création de la matrice utilisateur-film : {e}")


Matrice utilisateur-film créée avec succès.


In [50]:
user_movie_matrix.head(3)

Unnamed: 0_level_0,'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### Réduction de la taille de la matrice

In [30]:
# Fixer un seuil pour le nombre minimum de notes par utilisateur et par film
min_user_ratings = 50
min_movie_ratings = 100

# Filtrer les utilisateurs et les films actifs
active_users = data['userId'].value_counts()
active_users = active_users[active_users >= min_user_ratings].index
active_movies = data['movieId'].value_counts()
active_movies = active_movies[active_movies >= min_movie_ratings].index

# Créer un sous-ensemble du DataFrame pour les utilisateurs et films actifs
filtered_data = data[(data['userId'].isin(active_users)) & (data['movieId'].isin(active_movies))]

# Créer la matrice utilisateur-film
user_movie_matrix = filtered_data.pivot_table(index='userId', columns='title', values='rating')

# Remplacer les valeurs manquantes par 0
user_movie_matrix.fillna(0, inplace=True)


### Gestion de la Mémoire et Optimisation

In [31]:
# Vérifier la taille de la matrice
print(f"Dimensions de la matrice utilisateur-film : {user_movie_matrix.shape}")

# Utiliser la mémoire efficacement (par exemple, en utilisant la précision float32 au lieu de float64)
user_movie_matrix = user_movie_matrix.astype(np.float32)


Dimensions de la matrice utilisateur-film : (85307, 8545)


In [32]:
# Calculer la similarité cosinus entre les films
item_similarity = cosine_similarity(user_movie_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

In [40]:
item_similarity_df

title,'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),...,Zulu (1964),[REC] (2007),[REC]² (2009),"\\""Great Performances\""\"" Cats (1998)""",eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Round Midnight (1986),1.000000,0.031480,0.028188,0.034825,0.068186,0.048036,0.032022,0.094780,0.022978,0.109058,...,0.050804,0.013801,0.018508,0.043746,0.030844,0.006882,0.023549,0.021756,0.041668,0.091125
'Salem's Lot (2004),0.031480,1.000000,0.020309,0.055325,0.029717,0.033198,0.089271,0.046733,0.002815,0.074161,...,0.014937,0.048306,0.071749,0.066190,0.017517,0.013555,0.026415,0.045970,0.025277,0.010735
'Til There Was You (1997),0.028188,0.020309,1.000001,0.056507,0.061669,0.020522,0.037814,0.043538,0.023374,0.048266,...,0.008465,0.001711,0.002707,0.021807,0.027952,0.001830,0.033114,0.022540,0.052628,0.012942
"'burbs, The (1989)",0.034825,0.055325,0.056507,1.000000,0.061129,0.076420,0.233865,0.102933,0.008211,0.113151,...,0.060353,0.046162,0.023313,0.026896,0.116980,0.043361,0.112046,0.042161,0.299700,0.024377
'night Mother (1986),0.068186,0.029717,0.061669,0.061129,1.000001,0.022189,0.031508,0.103543,0.015712,0.060976,...,0.037051,0.011558,0.012923,0.040745,0.037498,0.003649,0.015472,0.002967,0.053561,0.041853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),0.006882,0.013555,0.001830,0.043361,0.003649,0.054019,0.022089,0.019282,0.003337,0.018381,...,0.022530,0.063929,0.033482,0.014222,0.039808,1.000000,0.020539,0.022205,0.028222,0.022513
xXx (2002),0.023549,0.026415,0.033114,0.112046,0.015472,0.113465,0.115897,0.054295,0.004734,0.090426,...,0.089439,0.087130,0.048121,0.016301,0.178241,0.020539,1.000000,0.309820,0.165257,0.031831
xXx: State of the Union (2005),0.021756,0.045970,0.022540,0.042161,0.002967,0.062795,0.048753,0.027334,0.003316,0.031017,...,0.029140,0.054005,0.043905,0.006306,0.058306,0.022205,0.309820,1.000002,0.058589,0.007581
¡Three Amigos! (1986),0.041668,0.025277,0.052628,0.299700,0.053561,0.073385,0.132554,0.103292,0.003232,0.112094,...,0.091979,0.040338,0.018954,0.015900,0.142924,0.028222,0.165257,0.058589,1.000000,0.041499


# 3. K-Nearest Neighbors (KNN)

In [45]:
# Convertir la matrice utilisateur-film en float32
user_movie_matrix = user_movie_matrix.astype('float32')


## A jout des genres 

### Créer une Matrice Genre-Film

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convertir les genres en une liste de genres par film
genres_list = data['genres'].str.split('|').apply(lambda x: x if isinstance(x, list) else [])

# Appliquer le one-hot encoding aux genres
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(genres_list)
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=data['title'])

# Fusionner avec la matrice utilisateur-film
user_movie_matrix = user_movie_matrix.join(genres_df, how='left')


In [53]:
user_movie_matrix.head(5)

Unnamed: 0_level_0,'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [56]:
user_movie_matrix['IMAX'].unique().sum()

nan

### Combiner Notes et Genres

In [17]:
# Créer une matrice de caractéristiques combinée (notes + genres)
combined_features = user_movie_matrix.fillna(0)


In [24]:
combined_features["Mystery"]

Index([''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''burbs, The (1989)',
       ''night Mother (1986)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '1-900 (06) (1994)', '10 (1979)',
       ...
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object', length=8565)

### Appliquer KNN avec les Caractéristiques Combinées

In [18]:
from sklearn.neighbors import NearestNeighbors

# Créer le modèle KNN
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(combined_features.T)  # Chaque colonne représente un film

# Trouver les films les plus similaires à un film donné
movie_title = 'xXx (2002)'
if movie_title in combined_features.columns:
    movie_index = combined_features.columns.get_loc(movie_title)
    movie_vector = combined_features.iloc[:, movie_index].values.reshape(1, -1)

    # Trouver les voisins
    distances, indices = knn.kneighbors(movie_vector, n_neighbors=10)

    # Afficher les films similaires
    similar_movies = combined_features.columns[indices.flatten()]
    print("Films similaires à '{}':".format(movie_title))
    print(similar_movies)
else:
    print("Le film '{}' n'est pas présent dans la matrice.".format(movie_title))


Films similaires à 'xXx (2002)':
Index(['xXx (2002)', 'Fast and the Furious, The (2001)', 'Blade II (2002)',
       'Daredevil (2003)',
       'Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)',
       'Lara Croft: Tomb Raider (2001)', 'Die Another Day (2002)',
       'Transporter, The (2002)', 'Swordfish (2001)', 'S.W.A.T. (2003)'],
      dtype='object')


# 4. Modèles de machine learning (SVD)

### Amppliquer SVD

In [19]:
print("Nombre de valeurs manquantes dans la matrice utilisateur-film :")
print(user_movie_matrix.isna().sum().sum())

# Remplacer les valeurs manquantes par 0
user_movie_matrix.fillna(0, inplace=True)

# Convertir en float32 pour la factorisation
user_movie_matrix = user_movie_matrix.astype('float32')

# Appliquer TruncatedSVD
svd = TruncatedSVD(n_components=50)  # Ajustez le nombre de composants selon vos besoins
matrix_svd = svd.fit_transform(user_movie_matrix)
matrix_svd_reconstructed = np.dot(matrix_svd, svd.components_)

# Créer un DataFrame pour la matrice reconstruite
user_movie_matrix_reconstructed = pd.DataFrame(matrix_svd_reconstructed, index=user_movie_matrix.index, columns=user_movie_matrix.columns)


Nombre de valeurs manquantes dans la matrice utilisateur-film :
1706140


In [20]:
user_movie_matrix_reconstructed

Unnamed: 0_level_0,'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),10 (1979),...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.005536,0.026068,-0.034218,0.186016,0.019929,-0.379340,0.227934,-0.004071,0.005682,0.068516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.002583,-0.003388,0.023219,0.023421,0.000150,0.057781,-0.022853,0.041101,0.000239,0.000915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.025560,0.001311,-0.014494,0.163126,0.026355,0.242108,0.090564,0.082558,-0.006544,-0.005961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.008861,-0.000769,-0.014497,-0.129011,0.020740,-0.192382,-0.060014,0.041582,0.003227,-0.023192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.004433,-0.013958,0.140929,0.162015,0.055963,0.065226,-0.031644,0.230967,-0.010151,0.101151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138486,-0.033070,0.024104,-0.029562,0.266940,0.061784,-0.084441,0.091699,0.028187,0.011660,0.006580,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138487,0.012333,0.004110,0.028811,-0.033760,0.032528,0.055301,0.026494,0.050978,0.005363,0.031670,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138490,0.019779,0.004981,0.049048,-0.035373,0.097354,0.039622,0.037952,0.065225,0.016171,0.019287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138492,-0.001948,-0.000288,-0.026795,0.052249,-0.053139,0.256181,-0.010849,-0.045092,-0.006239,-0.011090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def recommend_top_10_movies_svd(df, user_id):
    """
    Recommande les 10 meilleurs films pour un utilisateur spécifique.
    
    Parameters:
    - df (pd.DataFrame): DataFrame avec une colonne 'userId' et les colonnes des scores des films.
    - user_id (int): L'identifiant de l'utilisateur pour lequel on veut faire des recommandations.
    
    Returns:
    - pd.Series: Une série contenant les 10 films les mieux notés pour l'utilisateur donné.
    """
    # Si userId est dans l'index, le ramener en tant que colonne
    if 'userId' not in df.columns:
        df = df.reset_index()
    
    # Assurez-vous que l'user_id est un entier
    user_id = int(user_id)
    
    # Vérifie si l'utilisateur est présent dans le DataFrame
    if user_id not in df['userId'].values:
        raise ValueError(f"L'utilisateur avec l'ID {user_id} n'existe pas dans le DataFrame.")

    # Sélectionne les scores des films pour l'utilisateur donné
    user_scores = df[df['userId'] == user_id].drop(columns=['userId']).iloc[0]

    # Trie les scores dans l'ordre décroissant pour obtenir les meilleurs films
    top_10_movies = user_scores.sort_values(ascending=False).head(10)
    
    return top_10_movies

# Test de la fonction avec un utilisateur donné
top_movies = recommend_top_10_movies_svd(user_movie_matrix_reconstructed, 5)
print(top_movies)


Star Wars: Episode IV - A New Hope (1977)                                         4.849162
Jurassic Park (1993)                                                              4.472060
Fugitive, The (1993)                                                              4.198931
Star Wars: Episode VI - Return of the Jedi (1983)                                 4.159707
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    3.865377
Star Wars: Episode V - The Empire Strikes Back (1980)                             3.833481
Forrest Gump (1994)                                                               3.818830
Lion King, The (1994)                                                             3.815785
Schindler's List (1993)                                                           3.722213
Apollo 13 (1995)                                                                  3.569160
Name: 3, dtype: float32


### Evaluation des performances

In [None]:
# def calculate_rmse(original, reconstructed):
#     mask = original > 0  # Masquer les valeurs manquantes
#     mse = mean_squared_error(original[mask], reconstructed[mask])
#     rmse = np.sqrt(mse)
#     return rmse

# rmse = calculate_rmse(user_movie_matrix, user_movie_matrix_reconstructed)
# print(f"RMSE: {rmse:.4f}")


# 5. Réseaux de neurones (Autoencoder)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import Sequence
import numpy as np

# Préparer les données
X = user_movie_matrix.values
input_dim = X.shape[1]

# Construire le modèle Autoencoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(32, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Entraîner le modèle
history = autoencoder.fit(X, X, epochs=10, batch_size=256, shuffle=True)

# Créer un générateur de données pour la prédiction en lot
class DataGenerator(Sequence):
    def __init__(self, data, batch_size=256):
        self.data = data
        self.batch_size = batch_size
        self.indices = np.arange(len(data))

    def __len__(self):
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_data = self.data[batch_indices]
        return batch_data

# Générer les prédictions par petits lots
data_generator = DataGenerator(X, batch_size=256)
encoded_movies = autoencoder.predict(data_generator)


In [None]:
import matplotlib.pyplot as plt

# Afficher la perte en fonction des époques
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()


In [None]:
# Problèmes volume de données
import shap

# Créez un échantillon réduit de vos données pour calculer les valeurs SHAP
X_sample = X[np.random.choice(X.shape[0], 100, replace=False)]

# Créez un explainer SHAP pour les modèles basés sur le Deep Learning
explainer = shap.KernelExplainer(autoencoder.predict, X_sample)

# Calculez les valeurs SHAP
shap_values = explainer.shap_values(X_sample)

# Visualisation des valeurs SHAP
shap.summary_plot(shap_values, X_sample, feature_names=user_movie_matrix.columns)


In [None]:
# Accéder aux poids de la première couche dense (64 neurones)
weights, biases = autoencoder.layers[1].get_weights()

# Afficher les poids associés à la première couche dense
plt.figure(figsize=(12, 6))
plt.bar(range(len(weights)), np.mean(np.abs(weights), axis=1))
plt.xlabel('Features')
plt.ylabel('Mean Absolute Weight')
plt.title('Feature Importance from the First Dense Layer')
plt.show()


In [None]:
from sklearn.manifold import TSNE

# Réduire la dimension avec T-SNE
X_encoded = autoencoder.predict(X)
X_tsne = TSNE(n_components=2).fit_transform(X_encoded)

# Tracer les résultats
plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5)
plt.title('T-SNE Visualization of Encoded Movie Features')
plt.xlabel('TSNE-1')
plt.ylabel('TSNE-2')
plt.show()


# Évaluation et comparaison des modèles

In [None]:
# Obtenir un échantillon de données
sample_size = 10000  # Vous pouvez ajuster cette taille selon vos besoins
sample_indices = np.random.choice(len(true_ratings), size=sample_size, replace=False)
true_ratings_sample = true_ratings[sample_indices]
pred_ratings_sample = pred_ratings[sample_indices]

# Calculer les métriques sur l'échantillon
rmse_simple, mae_simple = calculate_metrics(true_ratings_sample, pred_ratings_sample)

true_ratings = user_movie_matrix.values.flatten().astype(np.float32)
pred_ratings = np.repeat(average_ratings['rating'].mean(), len(true_ratings)).astype(np.float32)

def calculate_metrics_batch(true, pred, batch_size=10000):
    n = len(true)
    rmse_sum = 0
    mae_sum = 0
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        rmse_sum += np.sum((true[start:end] - pred[start:end]) ** 2)
        mae_sum += np.sum(np.abs(true[start:end] - pred[start:end]))
    rmse = np.sqrt(rmse_sum / n)
    mae = mae_sum / n
    return rmse, mae

rmse_simple, mae_simple = calculate_metrics_batch(true_ratings, pred_ratings)



In [None]:
# # Fonction pour calculer RMSE et MAE
# def calculate_metrics(true, pred):
#     rmse = np.sqrt(mean_squared_error(true, pred))
#     mae = mean_absolute_error(true, pred)
#     return rmse, mae

# # Placeholder pour les résultats
# results = []

# # Filtrage collaboratif simple
# true_ratings = user_movie_matrix.values.flatten()
# pred_ratings = np.repeat(average_ratings['rating'].mean(), len(true_ratings))
# rmse_simple, mae_simple = calculate_metrics(true_ratings, pred_ratings)
# results.append(('Simple Collaborative Filtering', rmse_simple, mae_simple))

# # Filtrage KNN
# _, indices = knn.kneighbors(user_movie_matrix.values)
# pred_knn = user_movie_matrix.values[indices].mean(axis=1).flatten()
# rmse_knn, mae_knn = calculate_metrics(true_ratings, pred_knn)
# results.append(('KNN', rmse_knn, mae_knn))

# # SVD
# results.append(('SVD', rmse_svd, mae_svd))

# # Résultats
# results_df = pd.DataFrame(results, columns=['Method', 'RMSE', 'MAE'])
# print(results_df)


# Visualisation des résultats

In [None]:
# Visualisation des résultats
results_df.plot(x='Method', y=['RMSE', 'MAE'], kind='bar')
plt.title('Performance Comparison')
plt.xlabel('Method')
plt.ylabel('Error')
plt.show()