In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('input_data\\rating.csv')

In [3]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [3]:
# Droper la colonne 'timestamp'
df = df.drop(columns=['timestamp'])

# Mapping 1 and -1 into ratings above and below threshold (4)
df['rating'] = df['rating'].map(lambda x: 1 if x >= 4 else -1)


In [4]:
df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,2,-1
1,1,29,-1
2,1,32,-1
3,1,47,-1
4,1,50,-1


In [5]:
df['rating'] = df['rating'].astype('Int8')

In [17]:
df.dtypes

userId     int64
movieId    int64
rating      Int8
dtype: object

In [6]:
df.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [12]:
df['movieId'].nunique()

26744

In [13]:
df['userId'].nunique()

138493

#### Pivoting the DataFrame into a Binarized Factorization Matrix

In [6]:
user_splits = np.array_split(df['userId'].unique(), 10)

df_pivot_list = []
for split in user_splits:
    df_subset = df[df['userId'].isin(split)]
    df_pivot_subset = pd.pivot_table(df_subset, values='rating', index='userId', columns='movieId')
    df_pivot_list.append(df_pivot_subset)
    print("New split on duty !")

df_pivot = pd.concat(df_pivot_list)


New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !


In [7]:
df_pivot.shape

(138493, 26744)

In [8]:
df_pivot.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,130586,130604,130614,130622,130656,130828,131110,131172,131237,131262
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-1.0,,,,,,,,,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,


## Premier test de recommandation avec les genres

In [23]:
imdb_encoded = pd.read_csv('output_data\\imdb_encoded_3.csv')
imdb_encoded.set_index('movieId', inplace = True)
imdb_encoded.head(2)

Unnamed: 0_level_0,Unnamed: 0,title,avg_movie_rating,movie_youth_rate,movie_popularity_rate,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,toy story,3.92124,0.591837,0.552906,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,jumanji,3.211977,0.591837,0.285514,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
imdb_encoded = imdb_encoded.drop(columns=['avg_movie_rating', 'movie_youth_rate','movie_popularity_rate'])
imdb_encoded.shape

(26689, 22)

In [25]:
imdb_encoded_num = imdb_encoded.select_dtypes(include=[float, int])

In [26]:
def matching_genres(movie_title):
    # Trouver l'index du film cible
    i = int(imdb_encoded.index[imdb_encoded['title'] == movie_title][0])
    print('1')
    # Sélectionner les genres du film cible
    target_genres = imdb_encoded_num.loc[i]
    print('2')
    # Calculer la corrélation des genres avec le film cible
    correlations = imdb_encoded_num.apply(lambda row: row.corr(target_genres), axis=1)
    print('3')
    # Créer un DataFrame des résultats pour les genres
    df_genre = pd.DataFrame(correlations, columns=['PearsonG'])
    print('4')
    # Ajouter les titres des films
    df_genre = df_genre.join(imdb_encoded[['title']])
    print('5')
    # Trier les résultats par la corrélation des genres en ordre décroissant
    df_genre = df_genre.sort_values('PearsonG', ascending=False)
    df_genre.dropna(inplace=True)
    print('6')
    # Calculer la corrélation des notes avec le film cible
    target = df_pivot.loc[i]
    similar_to_target = df_pivot.corrwith(target)
    print('7')
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    print('8')
    corr_target.dropna(inplace=True)
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    print('9')
    corr_target.index = corr_target.index.map(int)
    print('10')
    # Fusionner les DataFrames de corrélations de genres et de notes
    df_combined = corr_target.join(df_genre.set_index(df_genre.index), how='inner')
    print('11')
    # Trier par corrélation de PearsonR et PearsonG
    df_combined = df_combined.sort_values(['PearsonR', 'PearsonG'], ascending=False)
    print('12')
    # Afficher les 20 meilleures recommandations
    print(df_combined.head(20).to_string(index=False))

# Appeler la fonction pour un exemple
matching_genres("rentun ruusu")


1
2
3
4
5
6


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[N

6.1
6.2
6.3
7
8
9
 PearsonR  PearsonG                                                 title
      1.0  1.000000                                           scary movie
      1.0  1.000000                                          jackie brown
      1.0  1.000000                                 lord of the rings the
      1.0  0.999999                                     boot das boat the
      1.0  1.000000        maria full of grace maria llena eres de gracia
      1.0  1.000000                                               hidalgo
      1.0  1.000000                                       boyz n the hood
      1.0  1.000000                            dragon the bruce lee story
      1.0  1.000000                                         love actually
      1.0  1.000000                                   save the last dance
      1.0  1.000000                                               my girl
      1.0  1.000000                                           sandlot the
      1.0  1.000000 