In [10]:
# Importing necessary libraries
import nltk
from nltk.data import find
import gensim

# Download the NLTK resources used below (NLTK reports "already up-to-date" when they are present)
nltk.download('punkt')  # NLTK tokenizer data
nltk.download('word2vec_sample')  # Sample of a pre-trained word2vec model

# Resolve the on-disk path of the downloaded word2vec sample inside the nltk_data directory
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors with Gensim; binary=False because the file is read as word2vec text format
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package word2vec_sample to
[nltk_data]     C:\Users\jcrig\AppData\Roaming\nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [11]:
import numpy as np
import pandas as pd

In [12]:
# Load the user-tag table. A forward slash keeps this relative path valid on
# every OS (a backslash separator only resolves on Windows).
df = pd.read_csv('output_data/384k_tags_wo_names.csv')

In [13]:
# Preview the first rows of the user-tag table.
# The drop of an 'Unnamed: 0' column below is intentionally disabled.
#df.drop('Unnamed: 0', axis=1, inplace=True)
df.head(5)

Unnamed: 0,userId,movieId,tag,age
0,65,208,dark hero,11.0
1,65,353,dark hero,11.0
2,65,521,noir thriller,11.0
3,65,592,dark hero,11.0
4,65,668,bollywood,11.0


In [14]:
# Number of rows in the user-tag table as loaded
len(df)

384120

In [15]:
# Turn a (possibly multi-word) tag into a single embedding vector
def vectorize_tag(tag, model):
    """Return the sum of the word vectors of every word in ``tag``.

    Parameters
    ----------
    tag : str
        Tag text; it is split on whitespace into words.
    model : mapping-like
        Any object supporting ``word in model`` and ``model[word]``
        (for example a Gensim ``KeyedVectors`` instance).

    Returns
    -------
    numpy.ndarray or float
        Element-wise sum of the word vectors, or ``np.nan`` when the tag is
        not a string, contains no words, or contains at least one word that
        is absent from ``model``.
    """
    # Missing tags come out of read_csv as float NaN, which has no .split()
    if not isinstance(tag, str):
        return np.nan

    words = tag.split()
    # Without this guard np.sum([]) would return the scalar 0.0, which is not
    # a vector and would not be removed by a later dropna()
    if not words:
        return np.nan

    vectors = []
    for word in words:
        if word not in model:
            return np.nan  # One unknown word invalidates the whole tag
        vectors.append(model[word])
    return np.sum(vectors, axis=0)  # Sum of the word vectors of the tag

# Compute one embedding per row from its tag text
df['vector'] = df['tag'].apply(vectorize_tag, args=(model,))

# Discard rows whose vector is NaN, i.e. tags containing an unrecognised word
df.dropna(subset=['vector'], inplace=True)
df.head()

Unnamed: 0,userId,movieId,tag,age,vector
0,65,208,dark hero,11.0,"[0.1728352, 0.110455096, 0.1315891, -0.0210759..."
1,65,353,dark hero,11.0,"[0.1728352, 0.110455096, 0.1315891, -0.0210759..."
3,65,592,dark hero,11.0,"[0.1728352, 0.110455096, 0.1315891, -0.0210759..."
5,65,898,screwball comedy,11.0,"[0.059081092, -0.037012, 0.030395936, 0.240585..."
7,65,1391,mars,11.0,"[0.00196312, 0.0980603, 0.0234936, -0.0271964,..."


In [19]:
model["mars"][:8]
# The model does not understand proper nouns; those words are noise and must be removed


array([ 0.00196312,  0.0980603 ,  0.0234936 , -0.0271964 ,  0.0145558 ,
        0.066395  ,  0.112361  , -0.0595001 ], dtype=float32)

In [17]:
# (rows, columns) of the user-tag table at this point
df.shape

(237015, 5)

In [20]:
# Load the genome tag table (columns tagId, tag).
# The forward slash avoids the invalid "\g" escape sequence of the previous
# literal and keeps the relative path valid on every OS.
df_genome_tags = pd.read_csv('input_data/genome_tags.csv')
df_genome_tags.head(5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [21]:
# Turn a (possibly multi-word) tag into a single embedding vector
def vectorize_tag(tag, model):
    """Return the sum of the word vectors of every word in ``tag``.

    Parameters
    ----------
    tag : str
        Tag text; it is split on whitespace into words.
    model : mapping-like
        Any object supporting ``word in model`` and ``model[word]``
        (for example a Gensim ``KeyedVectors`` instance).

    Returns
    -------
    numpy.ndarray or float
        Element-wise sum of the word vectors, or ``np.nan`` when the tag is
        not a string, contains no words, or contains at least one word that
        is absent from ``model``.
    """
    # Missing tags come out of read_csv as float NaN, which has no .split()
    if not isinstance(tag, str):
        return np.nan

    words = tag.split()
    # Without this guard np.sum([]) would return the scalar 0.0, which is not
    # a vector and would not be removed by a later dropna()
    if not words:
        return np.nan

    vectors = []
    for word in words:
        if word not in model:
            return np.nan  # One unknown word invalidates the whole tag
        vectors.append(model[word])
    return np.sum(vectors, axis=0)  # Sum of the word vectors of the tag

# Compute one embedding per genome tag from its text
df_genome_tags['vector'] = df_genome_tags['tag'].apply(vectorize_tag, args=(model,))

# Discard rows whose vector is NaN, i.e. tags containing an unrecognised word
df_genome_tags.dropna(subset=['vector'], inplace=True)
df_genome_tags.head()

Unnamed: 0,tagId,tag,vector
16,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ..."
17,18,absurd,"[0.0468222, -0.050771, 0.0409929, 0.0389245, -..."
18,19,action,"[-0.0138909, 0.0633263, 0.0185893, 0.0105203, ..."
19,20,action packed,"[0.0420461, 0.0945427, -0.0084770005, 0.010103..."
20,21,adaptation,"[0.0837244, -0.0257734, -0.0665421, 0.0565452,..."


In [22]:
# Load the genome score table (columns movieId, tagId, relevance).
# The forward slash avoids the invalid "\g" escape sequence of the previous
# literal and keeps the relative path valid on every OS.
df_genome_scores = pd.read_csv('input_data/genome_scores.csv')
df_genome_scores.head(5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [23]:
# Attach each vectorised genome tag to its per-movie relevance scores (inner join on tagId)
df_tags_merged = df_genome_tags.merge(df_genome_scores, how='inner', on='tagId')
df_tags_merged.head()

Unnamed: 0,tagId,tag,vector,movieId,relevance
0,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",1,0.007
1,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",2,0.0095
2,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",3,0.01175
3,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",4,0.014
4,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",5,0.01425


In [25]:
# Number of (tag, movie) rows after the merge
len(df_tags_merged)

7640416

In [26]:
# Keep only (movie, tag) pairs whose relevance exceeds 0.5. The explicit
# .copy() makes the result an independent frame, so adding a column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df_tags_merged = df_tags_merged[df_tags_merged['relevance'] > 0.5].copy()
len(df_tags_merged)

365316

In [27]:
# Scale a row's tag vector by that row's relevance score
def apply_relevance(row):
    """Return ``row['vector']`` as a NumPy array multiplied by ``row['relevance']``."""
    vec = np.array(row['vector'])
    weight = row['relevance']
    return vec * weight

# Apply the function to every row (axis=1 passes one row at a time)
df_tags_merged['weighted_vector'] = df_tags_merged.apply(apply_relevance, axis=1)
df_tags_merged.head(5)

Unnamed: 0,tagId,tag,vector,movieId,relevance,weighted_vector
1243,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",1392,0.97325,"[0.0075675636, 0.02316841, 0.06519743, 0.08661..."
2798,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",3148,0.85875,"[0.006677262, 0.020442715, 0.057527147, 0.0764..."
2849,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",3210,0.6765,"[0.0052601667, 0.016104218, 0.04531833, 0.0602..."
3430,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",3888,0.63675,"[0.0049510878, 0.01515796, 0.042655498, 0.0566..."
3703,17,abortion,"[0.00777556, 0.0238052, 0.0669894, 0.0890002, ...",4191,0.745,"[0.0057927924, 0.017734874, 0.049907103, 0.066..."


In [28]:
# Row count after adding the weighted vectors
len(df_tags_merged)

365316

In [31]:
# The raw vector and the relevance are no longer needed once the weighted vector exists
df_tags_merged.drop(columns=['vector', 'relevance'], inplace=True)

In [32]:
# Preview the frame after dropping the intermediate columns
df_tags_merged.head()

Unnamed: 0,tagId,tag,movieId,weighted_vector
1243,17,abortion,1392,"[0.0075675636, 0.02316841, 0.06519743, 0.08661..."
2798,17,abortion,3148,"[0.006677262, 0.020442715, 0.057527147, 0.0764..."
2849,17,abortion,3210,"[0.0052601667, 0.016104218, 0.04531833, 0.0602..."
3430,17,abortion,3888,"[0.0049510878, 0.01515796, 0.042655498, 0.0566..."
3703,17,abortion,4191,"[0.0057927924, 0.017734874, 0.049907103, 0.066..."


In [37]:
# Group by 'movieId' and average the 'weighted_vector' arrays element-wise
df_avg_vector = (
    df_tags_merged
    .groupby('movieId')['weighted_vector']
    .apply(lambda vecs: np.stack(vecs).mean(axis=0))
)

# Turn the movieId index back into a regular column (Series -> DataFrame)
df_avg_vector = df_avg_vector.reset_index()

df_avg_vector.head()

Unnamed: 0,movieId,weighted_vector
0,1,"[0.034586858, 0.0041892263, -0.007967652, 0.04..."
1,2,"[0.02394018, 0.012072621, -0.012650338, 0.0410..."
2,3,"[0.025589697, -0.0041956874, -0.0049643833, 0...."
3,4,"[0.014996908, -0.0033440643, -0.023636833, 0.0..."
4,5,"[0.023046032, -0.010654649, -0.01876496, 0.041..."


In [38]:
# Number of movies that received an averaged vector
len(df_avg_vector)

10381

In [35]:
# Export the dataframe as CSV (without the index column).
# NOTE(review): 'weighted_vector' holds NumPy arrays, which CSV presumably stores
# as their string representation — confirm downstream readers can parse it back.
df_avg_vector.to_csv('output_data/movie_avg_vectors.csv',  index=False)