In [1]:
import numpy as np
import pandas as pd
import re


In [2]:
# Load the user-generated tags. A forward slash is used as the path separator
# because it is accepted on Windows as well as on POSIX systems, whereas a
# backslash separator only works on Windows.
df_tag = pd.read_csv('input_data/tag.csv', delimiter=',')
df_tag.head(4)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43


In [3]:
# Dimensions of the raw tag table as (rows, columns).
df_tag.shape

(465564, 4)

In [4]:
# Tally the missing values per column, then keep only the columns that
# actually contain at least one NaN.
nan_counts = df_tag.isnull().sum(axis=0)
has_missing = nan_counts.gt(0)
nan_columns_counts = nan_counts.loc[has_missing]
nan_columns_counts

tag    16
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
df_tag = df_tag.dropna(axis=0, how='any')


In [6]:
import nltk
# Fetch the NLTK 'names' corpus so it is available locally.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [7]:
from nltk.corpus import names

# Build lookup sets of male and female first names from the NLTK corpus
male_names, female_names = (
    set(names.words(corpus_file)) for corpus_file in ('male.txt', 'female.txt')
)

# Replace the whole tag with "actor" or "actress" when it contains a first name
def replace_name(tag):
    """Collapse a tag that mentions a known first name into a generic label.

    The tag is split on whitespace and its words are examined in order. The
    first word found in `male_names` or `female_names` decides the result:
    "actor" if it is in `male_names` (checked first), otherwise "actress".
    When no word matches, the tag is returned unchanged.
    """
    labels = (
        "actor" if token in male_names else "actress"
        for token in tag.split()
        if token in male_names or token in female_names
    )
    # Fall back to the original tag when no first name was found
    return next(labels, tag)

# Run every tag through replace_name so that tags containing a first name are generalized
df_tag['tag'] = df_tag['tag'].map(replace_name)

df_tag.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,actor,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [8]:
# Inspect the dtype pandas assigned to each column.
df_tag.dtypes


userId        int64
movieId       int64
tag          object
timestamp    object
dtype: object

In [9]:
# Parse the 'timestamp' strings into datetime values
df_tag['timestamp'] = pd.to_datetime(df_tag['timestamp'])

# Age of each tag in whole years, measured from the moment this cell runs
current_date = pd.Timestamp.now()
elapsed_days = (current_date - df_tag['timestamp']).dt.days
df_tag['age'] = (elapsed_days / 365.25).round(0)

# The raw 'timestamp' column is no longer needed
df_tag = df_tag.drop('timestamp', axis=1)


In [10]:
# Round the tag age to the nearest whole year
df_tag['age'] = df_tag['age'].round(0)
df_tag

Unnamed: 0,userId,movieId,tag,age
0,18,4141,actor,15.0
1,65,208,dark hero,11.0
2,65,353,dark hero,11.0
3,65,521,noir thriller,11.0
4,65,592,dark hero,11.0
...,...,...,...,...
465559,138446,55999,dragged,12.0
465560,138446,55999,actor,12.0
465561,138446,55999,quirky,12.0
465562,138446,55999,sad,12.0


In [11]:
# Load the movie catalogue. The path uses a forward slash: a lone backslash
# before "m" is an invalid escape sequence in a normal string literal, and a
# forward slash is accepted on Windows as well as on POSIX systems.
df_movie = pd.read_csv('input_data/movie.csv')

df_movie.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [12]:
# Split the pipe-separated genres into a list, then join them with spaces
df_movie['genres'] = df_movie['genres'].str.split('|').str.join(' ')

# Split "Title (YYYY)" into the bare title and the four-digit release year.
# Surrounding whitespace is stripped first: otherwise a title with a trailing
# space fails the `$` anchor and both extracted columns come back as NaN.
df_movie[['title', 'year']] = (
    df_movie['title'].str.strip().str.extract(r'^(.*)\s\((\d{4})\)$')
)

df_movie.head(3)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995
1,2,Jumanji,Adventure Children Fantasy,1995
2,3,Grumpier Old Men,Comedy Romance,1995


In [13]:
# Drop the rows for which no title/year could be extracted (the original
# author counted 55 such rows), derive the movie age from the release year,
# then discard the intermediate 'year' column.
#
# The age is measured against the current calendar year rather than a
# hard-coded 2024, so it does not go stale when the notebook is re-run and it
# uses the same "now" reference as the tag age computed from pd.Timestamp.now().
# Building the result in one chain also avoids assigning columns onto the
# frame returned by dropna().
reference_year = pd.Timestamp.now().year
df_movie = (
    df_movie
    .dropna(subset=['title', 'year'])
    .assign(age_movie=lambda movies: reference_year - movies['year'].astype(int))
    .drop(columns=['year'])
)

df_movie.head(5)

Unnamed: 0,movieId,title,genres,age_movie
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,29
1,2,Jumanji,Adventure Children Fantasy,29
2,3,Grumpier Old Men,Comedy Romance,29
3,4,Waiting to Exhale,Comedy Drama Romance,29
4,5,Father of the Bride Part II,Comedy,29


In [14]:
# Attach title, genres and movie age to every tag row; the left join keeps
# all tag rows, including those whose movieId is absent from df_movie
df_tag_title_genres = pd.merge(df_tag, df_movie, how='left', on='movieId')

df_tag_title_genres.head(5)

Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie
0,18,4141,actor,15.0,Head Over Heels,Comedy Romance,23.0
1,65,208,dark hero,11.0,Waterworld,Action Adventure Sci-Fi,29.0
2,65,353,dark hero,11.0,"Crow, The",Action Crime Fantasy Thriller,30.0
3,65,521,noir thriller,11.0,Romeo Is Bleeding,Crime Thriller,31.0
4,65,592,dark hero,11.0,Batman,Action Crime Thriller,35.0


In [15]:
# Dimensions of the merged table as (rows, columns).
df_tag_title_genres.shape

(465548, 7)

In [16]:
# Tag rows whose movieId has no match in df_movie come out of the left merge
# with NaN title/genres. Replace those with an empty string first: astype(str)
# alone would turn NaN into the literal text 'nan', which the text-processing
# cells that follow would then treat as a real word.
for text_col in ('title', 'genres'):
    df_tag_title_genres[text_col] = df_tag_title_genres[text_col].fillna('').astype(str)

In [17]:
# Libraries needed for the word-embedding step
import nltk
from nltk.data import find
import gensim

# Make sure the required NLTK resources are present locally:
# tokenizer data, the stop-word lists and the sample word2vec model
for resource in ('punkt', 'stopwords', 'word2vec_sample'):
    nltk.download(resource)

# Locate the pre-trained word2vec file through NLTK's resource finder
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors with Gensim (binary=False: the file is read as text)
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [18]:
# Sanity check of the loaded embeddings: similarity score between two related words.
model.similarity('actor','actress')

0.79300094

In [19]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the stop-word lists and the tokenizer data are available
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# Strip stop words from a piece of text
def remove_stopwords(text):
    """Return `text` lower-cased, without stop words or non-alphabetic tokens.

    The lower-cased text is tokenized with `word_tokenize`. Tokens that are
    in `stop_words`, or that are not purely alphabetic, are discarded; the
    remaining tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip stop words as well as anything containing digits or punctuation
        if token in stop_words or not token.isalpha():
            continue
        kept.append(token)
    return ' '.join(kept)

# Remove stop words from both the 'title' and the 'tag' columns
for text_col in ('title', 'tag'):
    df_tag_title_genres[text_col] = df_tag_title_genres[text_col].apply(remove_stopwords)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def clean_text(text):
    """Lower-case `text` and drop every character other than a-z, 0-9 and whitespace."""
    # Lower-casing happens before the substitution, so upper-case letters
    # are kept (as lower case) instead of being stripped by the pattern
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Normalize the three text columns: 'tag', 'title' and 'genres'
for text_col in ('tag', 'title', 'genres'):
    df_tag_title_genres[text_col] = df_tag_title_genres[text_col].apply(clean_text)

df_tag_title_genres.head(10)

Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie
0,18,4141,actor,15.0,head heels,comedy romance,23.0
1,65,208,dark hero,11.0,waterworld,action adventure scifi,29.0
2,65,353,dark hero,11.0,crow,action crime fantasy thriller,30.0
3,65,521,noir thriller,11.0,romeo bleeding,crime thriller,31.0
4,65,592,dark hero,11.0,batman,action crime thriller,35.0
5,65,668,bollywood,11.0,song little road pather panchali,drama,69.0
6,65,898,screwball comedy,11.0,philadelphia story,comedy drama romance,84.0
7,65,1248,noir thriller,11.0,touch evil,crime filmnoir thriller,66.0
8,65,1391,mars,11.0,mars attacks,action comedy scifi,28.0
9,65,1617,,11.0,confidential,crime filmnoir mystery thriller,27.0


In [21]:
# Genre tokens and the word each one is replaced with
# (presumably because the original tokens are missing from the embedding
# vocabulary; TODO confirm)
mapping_dict = dict(
    scifi='future',
    thriller='suspense',
    filmnoir='cynical',
    musical='singing',
    western='cowboy',
)

# Apply the substitutions one after another to the 'genres' column
genres_col = df_tag_title_genres['genres']
for genre_token, substitute in mapping_dict.items():
    genres_col = genres_col.str.replace(genre_token, substitute, regex=True)
df_tag_title_genres['genres'] = genres_col

In [22]:
# Preview the first rows to check the genre substitutions.
df_tag_title_genres.head(10)

Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie
0,18,4141,actor,15.0,head heels,comedy romance,23.0
1,65,208,dark hero,11.0,waterworld,action adventure future,29.0
2,65,353,dark hero,11.0,crow,action crime fantasy suspense,30.0
3,65,521,noir thriller,11.0,romeo bleeding,crime suspense,31.0
4,65,592,dark hero,11.0,batman,action crime suspense,35.0
5,65,668,bollywood,11.0,song little road pather panchali,drama,69.0
6,65,898,screwball comedy,11.0,philadelphia story,comedy drama romance,84.0
7,65,1248,noir thriller,11.0,touch evil,crime cynical suspense,66.0
8,65,1391,mars,11.0,mars attacks,action comedy future,28.0
9,65,1617,,11.0,confidential,crime cynical mystery suspense,27.0


In [23]:
# Turn the words of a tag into a single embedding vector
def vectorize_tag(tag, model):
    """Return the mean embedding of the whitespace-separated words in `tag`.

    Parameters
    ----------
    tag : str
        Text to embed.
    model : mapping-like
        Object supporting `word in model` and `model[word]`, the latter
        returning the vector of that word.

    Returns
    -------
    numpy.ndarray or float
        Element-wise mean of the word vectors, or `np.nan` when `tag`
        contains no words or when any of its words is missing from `model`.
    """
    words = tag.split()
    # An empty tag has nothing to average. Return NaN directly instead of
    # letting np.mean run on an empty list, which gives NaN plus a RuntimeWarning.
    if not words:
        return np.nan
    vectors = []
    for word in words:
        if word not in model:
            return np.nan  # a single unknown word invalidates the whole tag
        vectors.append(model[word])
    return np.mean(vectors, axis=0)  # mean (not sum) of the word vectors

# Embed each text column into its matching '<name>_vector' column
for text_col, vector_col in (
    ('tag', 'tag_vector'),
    ('title', 'title_vector'),
    ('genres', 'genres_vector'),
):
    df_tag_title_genres[vector_col] = df_tag_title_genres[text_col].apply(
        lambda text: vectorize_tag(text, model)
    )


df_tag_title_genres.head(3)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie,tag_vector,title_vector,genres_vector
0,18,4141,actor,15.0,head heels,comedy romance,23.0,"[0.0536976, -0.0352089, -0.0556269, 0.0234726,...","[-0.04193265, -0.02813775, 0.05097, -0.0277740...","[0.032902144, -0.0385218, -0.03976148, 0.07610..."
1,65,208,dark hero,11.0,waterworld,action adventure future,29.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...",,"[0.0060361014, 0.017076494, 0.011771732, 0.026..."
2,65,353,dark hero,11.0,crow,action crime fantasy suspense,30.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...","[-0.00227776, 0.0247481, 0.0133911, 0.00635654...","[0.041928604, 0.0007887692, 0.025844725, -0.00..."


In [24]:
# All-zero vector with 300 elements
# (presumably the dimensionality of the embeddings in `model`; TODO confirm)
zero_vector = np.zeros(300)


def _zero_if_missing(vec):
    """Return `zero_vector` when `vec` is a float NaN, otherwise `vec` unchanged."""
    if isinstance(vec, float) and np.isnan(vec):
        return zero_vector
    return vec


# Swap the NaN placeholders for the zero vector in each vector column
for vector_col in ('tag_vector', 'title_vector', 'genres_vector'):
    df_tag_title_genres[vector_col] = df_tag_title_genres[vector_col].apply(_zero_if_missing)
df_tag_title_genres.head(3)

Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie,tag_vector,title_vector,genres_vector
0,18,4141,actor,15.0,head heels,comedy romance,23.0,"[0.0536976, -0.0352089, -0.0556269, 0.0234726,...","[-0.04193265, -0.02813775, 0.05097, -0.0277740...","[0.032902144, -0.0385218, -0.03976148, 0.07610..."
1,65,208,dark hero,11.0,waterworld,action adventure future,29.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0060361014, 0.017076494, 0.011771732, 0.026..."
2,65,353,dark hero,11.0,crow,action crime fantasy suspense,30.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...","[-0.00227776, 0.0247481, 0.0133911, 0.00635654...","[0.041928604, 0.0007887692, 0.025844725, -0.00..."


In [25]:
# Average the three embedding vectors of a row
def calculate_average_vector(row):
    """Return the element-wise mean of the row's tag, title and genres vectors."""
    stacked = np.array(
        [row[name] for name in ('tag_vector', 'title_vector', 'genres_vector')]
    )
    return stacked.mean(axis=0)

# Compute the averaged vector row by row and store it in a new column
df_tag_title_genres['user_movie_vector'] = df_tag_title_genres.apply(
    calculate_average_vector, axis='columns'
)
df_tag_title_genres.head(3)


Unnamed: 0,userId,movieId,tag,age,title,genres,age_movie,tag_vector,title_vector,genres_vector,user_movie_vector
0,18,4141,actor,15.0,head heels,comedy romance,23.0,"[0.0536976, -0.0352089, -0.0556269, 0.0234726,...","[-0.04193265, -0.02813775, 0.05097, -0.0277740...","[0.032902144, -0.0385218, -0.03976148, 0.07610...","[0.014889032, -0.03395615, -0.014806126, 0.023..."
1,65,208,dark hero,11.0,waterworld,action adventure future,29.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0060361014, 0.017076494, 0.011771732, 0.026...","[0.03081790062909325, 0.02410134735206763, 0.0..."
2,65,353,dark hero,11.0,crow,action crime fantasy suspense,30.0,"[0.0864176, 0.055227548, 0.06579455, -0.010537...","[-0.00227776, 0.0247481, 0.0133911, 0.00635654...","[0.041928604, 0.0007887692, 0.025844725, -0.00...","[0.042022813, 0.026921473, 0.035010125, -0.001..."
