# 1. Importations et initialisations nécessaires

In [1]:
import zipfile
import pandas as pd
import gensim, logging
from tqdm.notebook import tqdm
from utils.read_data import getDF
from utils.useful_functions import PATH_DATA
from utils.useful_functions import PATH_METADATA
from utils.useful_functions import clear_description

# Configure logging so that INFO-level messages (e.g. the gensim training log) are shown with a timestamp
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# 2. Récupération et visualisation des données

In [2]:
# Extract the dataset archive (dataset.zip) located at PATH_DATA into the current working directory.
# The handle is named `archive` rather than `data` so that it does not share a name
# with the DataFrame `data` that the notebook builds right after this cell.
with zipfile.ZipFile(PATH_DATA, 'r') as archive:
    archive.extractall()

In [3]:
# Read the data from the metadata.json file.
# getDF is a project helper (utils.read_data); its result is used below as a pandas DataFrame.
data = getDF(PATH_METADATA)

In [4]:
# Preview the first 5 rows of `data`; the caption also reports the shape of the full DataFrame
display(
    data.head(5)
    .style.format("{0}")
    .set_caption(f"5 premières lignes du DataFrame data après chargement des données : Dimension = {data.shape}")
)

Unnamed: 0,ID,title,slug,category,imPath
0,7541,Christina Gavioli,christina-gavioli-3,"['Fashion Women', 'Women Blouse and Dress']",images/Fashion Women/Women Blouse and Dress/CHRISTINA_GAVIOLI.jpg
1,7540,Sexy Woman,sexy-woman-3,"['Fashion Women', 'Women Blouse and Dress']",images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_MULTICOLORE.jpg
2,7539,Sexy Woman,sexy-woman-2,"['Fashion Women', 'Women Blouse and Dress']",images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_JAUNE.jpg
3,7538,Christina Gavioli,christina-gavioli-2,"['Fashion Women', 'Women Blouse and Dress']",images/Fashion Women/Women Blouse and Dress/christina_gavioli_bordeau.jpg
4,7537,Christina Gavioli,christina-gavioli,"['Fashion Women', 'Women Blouse and Dress']",images/Fashion Women/Women Blouse and Dress/christina_gavioli_blc_maron.jpg


# 3. Prétraitement et visualisation des données

In [5]:
# Remove the `slug` and `category` columns from `data`.
# drop(..., errors='ignore') is used instead of `del` so that re-running this cell
# after the columns are already gone does not raise a KeyError.
data = data.drop(columns=['slug', 'category'], errors='ignore')

In [6]:
# Per-column check for missing values: True for every column holding at least one null
data.isna().any(axis="index")

ID        False
title     False
imPath    False
dtype: bool

In [7]:
# Product titles double as product descriptions.
# Each title is passed through clear_description which, per the original notes, performs:
#   1. lowercasing
#   2. punctuation removal
#   3. tokenization
#   4. stop-word removal
# The result is stored in the new `tokens_list` column.
data['tokens_list'] = data['title'].apply(clear_description)

In [9]:
# Preview the first 5 rows of the preprocessed `data`; the caption also reports the shape of the full DataFrame
display(
    data.head(5)
    .style.format("{0}")
    .set_caption(f"5 premières lignes du DataFrame data prétraité : Dimension = {data.shape}")
)

Unnamed: 0,ID,title,imPath,tokens_list
0,7541,Christina Gavioli,images/Fashion Women/Women Blouse and Dress/CHRISTINA_GAVIOLI.jpg,"['christina', 'gavioli']"
1,7540,Sexy Woman,images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_MULTICOLORE.jpg,"['sexy', 'woman']"
2,7539,Sexy Woman,images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_JAUNE.jpg,"['sexy', 'woman']"
3,7538,Christina Gavioli,images/Fashion Women/Women Blouse and Dress/christina_gavioli_bordeau.jpg,"['christina', 'gavioli']"
4,7537,Christina Gavioli,images/Fashion Women/Women Blouse and Dress/christina_gavioli_blc_maron.jpg,"['christina', 'gavioli']"


# 4. Apprentissage des word embeddings via le modèle Skip-Gram

In [10]:
# Build the training corpus (one token list per product) and train the Word2Vec model.
# sg=1 selects the Skip-Gram architecture; min_count=1 keeps every token in the vocabulary.
# NOTE(review): per the gensim documentation, training with workers > 1 is not exactly
# reproducible between runs — confirm whether that matters for this notebook.
sentences = data['tokens_list'].tolist()
model = gensim.models.Word2Vec(
    sentences=sentences,
    sg=1,
    vector_size=300,
    window=5,
    min_count=1,
    epochs=10,
    workers=4,
)

2022-04-03 08:00:07,267 : INFO : collecting all words and their counts
2022-04-03 08:00:07,277 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-04-03 08:00:07,295 : INFO : collected 1468 word types from a corpus of 7758 raw words and 1655 sentences
2022-04-03 08:00:07,303 : INFO : Creating a fresh vocabulary
2022-04-03 08:00:07,363 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 1468 unique words (100.0%% of original 1468, drops 0)', 'datetime': '2022-04-03T08:00:07.363459', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'prepare_vocab'}
2022-04-03 08:00:07,378 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 7758 word corpus (100.0%% of original 7758, drops 0)', 'datetime': '2022-04-03T08:00:07.378449', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platf