In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('input_data\\rating.csv')

In [4]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Convertir la colonne 'timestamp' en datetime si nécessaire
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Calculer l'âge en années
current_date = pd.Timestamp.now()
df['age'] = round((current_date - df['timestamp']).dt.days / 365.25, 0)

# Droper la colonne 'timestamp'
df = df.drop(columns=['timestamp'])

In [6]:
df.head(5)

Unnamed: 0,userId,movieId,rating,age
0,1,2,3.5,19.0
1,1,29,3.5,19.0
2,1,32,3.5,19.0
3,1,47,3.5,19.0
4,1,50,3.5,19.0


In [7]:
df.shape

(20000263, 4)

In [8]:
df.dtypes

userId       int64
movieId      int64
rating     float64
age        float64
dtype: object

In [9]:
df['userId'] = df['userId'].astype('int32')
df['movieId'] = df['movieId'].astype('int32')
df['rating'] = df['rating'].astype('float32')


In [10]:
df.dtypes

userId       int32
movieId      int32
rating     float32
age        float64
dtype: object

In [11]:
nan_counts = df.isna().sum()
nan_counts

userId     0
movieId    0
rating     0
age        0
dtype: int64

In [12]:
df['movieId'].nunique()

26744

In [13]:
df['userId'].nunique()

138493

In [14]:
user_splits = np.array_split(df['userId'].unique(), 10)

df_pivot_list = []
for split in user_splits:
    df_subset = df[df['userId'].isin(split)]
    df_pivot_subset = pd.pivot_table(df_subset, values='rating', index='userId', columns='movieId')
    df_pivot_list.append(df_pivot_subset)

df_pivot = pd.concat(df_pivot_list)


In [15]:
df_pivot.shape

(138493, 26744)

In [16]:
df_pivot.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,130586,130604,130614,130622,130656,130828,131110,131172,131237,131262
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


## Premier test de recommandation avec les poids sur les genres

In [19]:
imdb_weighted_encoded = pd.read_csv('output_data\\imdb_weighted_encoded_3.csv')
imdb_weighted_encoded.set_index('movieId', inplace = True)
imdb_weighted_encoded.head(5)

Unnamed: 0_level_0,originalTitle,imdbId,averageRating,youth_rate,popularity_rate,Action,Adventure,Animation,Biography,Comedy,...,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western,ImdbRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95541,Blacksmith Scene,5,6.2,0.121019,0.000956,0.0,0.0,0.0,0.0,0.000108,...,0.0,0.0,0.0,0.0,0.002488,0.0,0.0,0.0,0.0,0.033107
88674,Edison Kinetoscopic Record of a Sneeze,8,5.4,0.127389,0.000756,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002488,0.0,0.0,0.0,0.0,-0.766892
120869,La sortie de l'usine Lumière à Lyon,10,6.8,0.133758,0.002611,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002488,0.0,0.0,0.0,0.0,0.633108
98981,L'arrivée d'un train à La Ciotat,12,7.4,0.140127,0.00445,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002488,0.0,0.0,0.0,0.0,1.233108
113048,L'arroseur arrosé,14,7.1,0.133758,0.002019,0.0,0.0,0.0,0.0,0.000108,...,0.0,0.0,0.0,0.0,0.002488,0.0,0.0,0.0,0.0,0.933107


In [20]:
imdb_weighted_encoded_num = imdb_weighted_encoded.select_dtypes(include=[float, int])

In [21]:
def matching_genres(movie_title):
    # Trouver l'index du film cible
    i = int(imdb_weighted_encoded.index[imdb_weighted_encoded['originalTitle'] == movie_title][0])
    print('1')
    # Sélectionner les genres du film cible
    target_genres = imdb_weighted_encoded_num.loc[i]
    print('2')
    # Calculer la corrélation des genres avec le film cible
    correlations = imdb_weighted_encoded_num.apply(lambda row: row.corr(target_genres), axis=1)
    print('3')
    # Créer un DataFrame des résultats pour les genres
    df_genre = pd.DataFrame(correlations, columns=['PearsonG'])
    print('4')
    # Ajouter les titres des films
    df_genre = df_genre.join(imdb_weighted_encoded[['originalTitle']])
    print('5')
    # Trier les résultats par la corrélation des genres en ordre décroissant
    df_genre = df_genre.sort_values('PearsonG', ascending=False)
    df_genre.dropna(inplace=True)
    print('6')
    # Calculer la corrélation des notes avec le film cible
    target = df_pivot.loc[i]
    similar_to_target = df_pivot.corrwith(target)
    print('6.1')
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    print('6.2')
    corr_target.dropna(inplace=True)
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    print('6.3')
    corr_target.index = corr_target.index.map(int)
    print('7')
    # Fusionner les DataFrames de corrélations de genres et de notes
    df_combined = corr_target.join(df_genre.set_index(df_genre.index), how='inner')
    print('8')
    # Trier par corrélation de PearsonR et PearsonG
    df_combined = df_combined.sort_values(['PearsonR', 'PearsonG'], ascending=False)
    print('9')
    # Afficher les 20 meilleures recommandations
    print(df_combined.head(20).to_string(index=False))

# Appeler la fonction pour un exemple
matching_genres("Burn After Reading")


1
2
3
4
5
6


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


6.1
6.2
6.3
7
8
9
 PearsonR  PearsonG                                          originalTitle
      1.0       1.0                                           The Wackness
      1.0       1.0                                          Despicable Me
      1.0       1.0                                         The Ugly Truth
      1.0       1.0                                             Life of Pi
      1.0       1.0                      Standing in the Shadows of Motown
      1.0       1.0                                         Hable con ella
      1.0       1.0                                       The Tao of Steve
      1.0       1.0                                        The Independent
      1.0       1.0                                  But I'm a Cheerleader
      1.0       1.0                                          The Mod Squad
      1.0       1.0                                              The Limey
      1.0       1.0                                  Bringing Out the Dead
      1

## Deuxieme test de recommendation sans les poids sur les genres

In [26]:
imdb_encoded = pd.read_csv('output_data\\imdb_encoded_3.csv')
imdb_encoded.set_index('movieId', inplace = True)
imdb_encoded.head(5)

Unnamed: 0_level_0,originalTitle,imdbId,ImdbRating,youth_rate,popularity_rate,Action,Adventure,Animation,Biography,Comedy,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95541,Blacksmith Scene,5,0.033107,0.121019,0.000956,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
88674,Edison Kinetoscopic Record of a Sneeze,8,-0.766892,0.127389,0.000756,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
120869,La sortie de l'usine Lumière à Lyon,10,0.633108,0.133758,0.002611,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
98981,L'arrivée d'un train à La Ciotat,12,1.233108,0.140127,0.00445,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
113048,L'arroseur arrosé,14,0.933107,0.133758,0.002019,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [27]:
imdb_encoded_num = imdb_encoded.select_dtypes(include=[float, int])

In [28]:
def matching_genres(movie_title):
    # Trouver l'index du film cible
    i = int(imdb_encoded.index[imdb_encoded['originalTitle'] == movie_title][0])
    print('1')
    # Sélectionner les genres du film cible
    target_genres = imdb_encoded_num.loc[i]
    print('2')
    # Calculer la corrélation des genres avec le film cible
    correlations = imdb_encoded_num.apply(lambda row: row.corr(target_genres), axis=1)
    print('3')
    # Créer un DataFrame des résultats pour les genres
    df_genre = pd.DataFrame(correlations, columns=['PearsonG'])
    print('4')
    # Ajouter les titres des films
    df_genre = df_genre.join(imdb_encoded[['originalTitle']])
    print('5')
    # Trier les résultats par la corrélation des genres en ordre décroissant
    df_genre = df_genre.sort_values('PearsonG', ascending=False)
    df_genre.dropna(inplace=True)
    print('6')
    # Calculer la corrélation des notes avec le film cible
    target = df_pivot.loc[i]
    similar_to_target = df_pivot.corrwith(target)
    print('6.1')
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    print('6.2')
    corr_target.dropna(inplace=True)
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    print('6.3')
    corr_target.index = corr_target.index.map(int)
    print('7')
    # Fusionner les DataFrames de corrélations de genres et de notes
    df_combined = corr_target.join(df_genre.set_index(df_genre.index), how='inner')
    print('8')
    # Trier par corrélation de PearsonR et PearsonG
    df_combined = df_combined.sort_values(['PearsonR', 'PearsonG'], ascending=False)
    print('9')
    # Afficher les 20 meilleures recommandations
    print(df_combined.head(20).to_string(index=False))

# Appeler la fonction pour un exemple
matching_genres("Burn After Reading")


1
2
3
4
5
6


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


6.1
6.2
6.3
7
8
9
 PearsonR  PearsonG                     originalTitle
      1.0       1.0                      The Wackness
      1.0       1.0                    The Ugly Truth
      1.0       1.0                     Despicable Me
      1.0       1.0                        Life of Pi
      1.0       1.0                  The Tao of Steve
      1.0       1.0 Standing in the Shadows of Motown
      1.0       1.0                   The Independent
      1.0       1.0                    Hable con ella
      1.0       1.0             But I'm a Cheerleader
      1.0       1.0             Bringing Out the Dead
      1.0       1.0                         The Limey
      1.0       1.0            The General's Daughter
      1.0       1.0                        Half Baked
      1.0       1.0                       With Honors
      1.0       1.0                         She-Devil
      1.0       1.0                 Indecent Proposal
      1.0       1.0          The House of the Spirits
      1.0 