In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('input_data\\rating.csv')

In [3]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
# Convertir la colonne 'timestamp' en datetime si nécessaire
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Calculer l'âge en années
current_date = pd.Timestamp.now()
df['age'] = round((current_date - df['timestamp']).dt.days / 365.25, 0)

# Droper la colonne 'timestamp'
df = df.drop(columns=['timestamp'])

In [5]:
# Normaliser l'âge pour obtenir le taux de jeunesse (0 à 1)
# Ici, l'âge le plus vieux devient 0 et l'âge le plus jeune devient 1
df['rating_youth_rate'] = 1 - (df['age'] - df['age'].min()) / (df['age'].max() - df['age'].min())

# Droper la colonne 'age'
df = df.drop(columns=['age'])
df.head(3)

Unnamed: 0,userId,movieId,rating,rating_youth_rate
0,1,2,3.5,0.52381
1,1,29,3.5,0.52381
2,1,32,3.5,0.52381


In [6]:
df['rating'] = df['rating'].astype('float32')
df['rating_youth_rate'] = df['rating_youth_rate'].astype('float32')

In [7]:
df_movie = pd.read_csv('input_data\\movie.csv')

In [8]:
df_movie.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


##### Splitting genres and Extracting title and year data for a easier use later on

In [9]:
# Utiliser str.extract pour séparer le titre et l'année
df_movie[['title', 'year']] = df_movie['title'].str.extract(r'^(.*)\s\((\d{4})\)$')

##### Splitting genres and Extracting title and year data for a easier use later on
# One-hot encoding des genres
df_genres_encoded = df_movie['genres'].str.get_dummies(sep='|')

# Concatenation avec le DataFrame original
df_movie = pd.concat([df_movie, df_genres_encoded], axis=1)
df_movie = df_movie.drop(['genres'], axis=1)
df_movie.head(3)

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


##### Dropping a few NaN in titles, and transforming date in age to later create youth_rate for movies

In [10]:
# Drop les lignes sans titles ni years (55 en tout)
df_movie = df_movie.dropna(subset=['title'])

# Convertir la colonne 'year' en entier
df_movie['year'] = df_movie['year'].astype(int)

# Creation de la colonne age_movie
df_movie['age'] = 2024 - df_movie['year']

# Normaliser l'âge pour obtenir le taux de jeunesse (0 à 1)
# Ici, l'âge le plus vieux devient 0 et l'âge le plus jeune devient 1
df_movie['movie_youth_rate'] = 1 - (df_movie['age'] - df_movie['age'].min()) / (df_movie['age'].max() - df_movie['age'].min())

df_movie['movie_youth_rate'] = df_movie['movie_youth_rate'].astype('float32')

# Droper la colonne 'age'
df_movie = df_movie.drop(columns=['age', 'year', '(no genres listed)'], axis=1)

df_movie.head(5)

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_youth_rate
0,1,Toy Story,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.83871
1,2,Jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.83871
2,3,Grumpier Old Men,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0.83871
3,4,Waiting to Exhale,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0.83871
4,5,Father of the Bride Part II,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.83871


In [11]:
# Importing necessary libraries
import nltk
from nltk.data import find
import gensim

# Downloading required NLTK resources
nltk.download('punkt')  # Downloading tokenizers for NLTK
nltk.download('stopwords')
nltk.download('word2vec_sample')  # Downloading the word2vec sample model

# Finding the path of the pre-trained word2vec model
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Loading the pre-trained word2vec model using Gensim
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Assure-toi d'avoir téléchargé les stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Charger les stopwords anglais
stop_words = set(stopwords.words('english'))

# Fonction pour supprimer les stopwords d'un texte
def remove_stopwords(text):
    # Tokenisation du texte
    words = word_tokenize(text.lower())
    # Filtrage des stopwords
    filtered_words = [word for word in words if word not in stop_words and word.isalpha()]
    # Rejoindre les mots filtrés en une seule chaîne
    return ' '.join(filtered_words)

# Appliquer la fonction à la colonne 'title'
df_movie['title'] = df_movie['title'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
'''# Fonction pour vectoriser les mots d'un title
def vectorize_title(title, model):
    vectors = []
    for word in title.split():
        if word in model:
            vectors.append(model[word])
        else:
            return np.zeros(model.vector_size)  # Retourne un vecteur nul si un mot n'est pas reconnu
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)


# Appliquer la fonction de vectorisation
df_movie['title_vector'] = df_movie['title'].apply(lambda x: vectorize_title(x, model))


df_movie.head(3)'''

"# Fonction pour vectoriser les mots d'un title\ndef vectorize_title(title, model):\n    vectors = []\n    for word in title.split():\n        if word in model:\n            vectors.append(model[word])\n        else:\n            return np.zeros(model.vector_size)  # Retourne un vecteur nul si un mot n'est pas reconnu\n    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)\n\n\n# Appliquer la fonction de vectorisation\ndf_movie['title_vector'] = df_movie['title'].apply(lambda x: vectorize_title(x, model))\n\n\ndf_movie.head(3)"

In [None]:
df_imdb = pd.read_csv('output_data\imdb_encoded_3.csv')

In [None]:
df_imdb = df_imdb[['movieId', 'popularity_rate']]

In [None]:
df_movie = pd.merge(df_movie, df_imdb, on='movieId', how='left')

df_movie.head(5)

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_youth_rate,popularity_rate
0,1,toy story,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0.83871,0.369477
1,2,jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.83871,0.130576
2,3,grumpier old men,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0.83871,0.01023
3,4,waiting exhale,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0.83871,0.004228
4,5,father bride part ii,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0.83871,0.01438


In [None]:
print(df_movie.columns)

Index(['movieId', 'title', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western', 'movie_youth_rate', 'popularity_rate'],
      dtype='object')


In [None]:
ordered_columns = ['movieId', 'title', 'movie_youth_rate', 'popularity_rate', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western']

df_movie = df_movie[ordered_columns]

df_movie.head(2)


Unnamed: 0,movieId,title,movie_youth_rate,popularity_rate,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story,0.83871,0.369477,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,jumanji,0.83871,0.130576,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_combined = pd.merge(df, df_movie, on='movieId', how='left')

df_combined.head(5)

Unnamed: 0,userId,movieId,rating,rating_youth_rate,title,movie_youth_rate,popularity_rate,Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2,3.5,0.52381,jumanji,0.83871,0.130576,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,29,3.5,0.52381,city lost children cité des enfants perdus la,0.83871,0.024644,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1,32,3.5,0.52381,twelve monkeys monkeys,0.83871,0.222506,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,1,47,3.5,0.52381,seven,0.83871,0.623154,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1,50,3.5,0.52381,usual suspects,0.83871,0.394856,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [20]:
df_combined['rating'] = df_combined['rating'].astype('float32')
df_combined['rating_youth_rate'] = df_combined['rating_youth_rate'].astype('float32')
df_combined['movie_youth_rate'] = df_combined['movie_youth_rate'].astype('float32')
df_combined['popularity_rate'] = df_combined['popularity_rate'].astype('float32')


In [21]:
df_combined.shape

(20000263, 26)