# Create vectors for the images in the dataset

In [1]:
import pandas as pd

# Load the product/image metadata table from a gzip-compressed, semicolon-separated CSV.
# read_csv already returns a DataFrame with the file's columns, so no re-wrapping is needed.
df = pd.read_csv('./data/feature_vectors_database_ai-preprod.gz', compression='gzip', sep=';')

In [2]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,product_id,season_year,brand_label,gender_label,image_url,sports_id_list,dsm_code,color_label,nature_id,sports_labels_list,product_label,nature_label,gender_id,image_id,country_code,active
0,8612301,2020.0,DITA,KIDS,https://contents.mediadecathlon.com/p1855779/s...,169,X8612301,LIGHT PINK,576630.0,field hockey,STICK DITA MEGATEC ENFANT PINK,HOCKEY STICK,4.0,p1855779,old,active
1,018dc1e4-9d0f-4548-baa9-d8b62549fa85,,,,https://contents.mediadecathlon.com/m11391253/...,,,,,,,,,m11391253,marketplace,inactive
2,8227830,2012.0,WEDZE,WOMEN'S,https://contents.mediadecathlon.com/p629818/sq...,232|233|234,X8227830,N/D,296992.0,alpine skiing|freeride skiing|freestyle skiing,FLOWFIT L SL2 MADRAS LIME 12,BASE LAYER,3.0,p629818,old,active
3,8170111,2011.0,VMC,NO GENDER,https://contents.mediadecathlon.com/p823162/sq...,99,X8170111,N/D,500384.0,athletics,HOOK 7356 BN,HOOKS,13.0,p823162,old,active
4,f692a4e1-4bb7-4cf2-992a-7a0cf2c090fe,,,,https://contents.mediadecathlon.com/m7952071/k...,,,,,,,,,m7952071,marketplace,inactive


In [3]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439405 entries, 0 to 439404
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   product_id          439405 non-null  object 
 1   season_year         223493 non-null  object 
 2   brand_label         219360 non-null  object 
 3   gender_label        223483 non-null  object 
 4   image_url           439405 non-null  object 
 5   sports_id_list      223493 non-null  object 
 6   dsm_code            223493 non-null  object 
 7   color_label         223493 non-null  object 
 8   nature_id           223493 non-null  float64
 9   sports_labels_list  223493 non-null  object 
 10  product_label       219793 non-null  object 
 11  nature_label        223493 non-null  object 
 12  gender_id           223483 non-null  float64
 13  image_id            439405 non-null  object 
 14  country_code        439405 non-null  object 
 15  active              438495 non-nul

In [4]:
import os

import requests

# Download the first 10 product images to local disk and remember where each one was saved.

image_dir = './data/images'
# open(..., 'wb') does not create missing parent folders, so make sure the target exists.
os.makedirs(image_dir, exist_ok=True)

image_urls = df["image_url"].values
image_file_names = []

for i, url in enumerate(image_urls[:10]):
    image_path = f'{image_dir}/{i}.jpg'
    # Bound the wait so a stalled connection cannot hang the cell indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as a .jpg.
    # Stopping (rather than skipping) keeps image i aligned with row i of df.
    response.raise_for_status()
    img_data = response.content
    with open(image_path, 'wb') as handler:
        handler.write(img_data)
    image_file_names.append(image_path)
    print(f'Saving image: {image_path} from {url}')

Saving image: ./data/images/0.jpg from https://contents.mediadecathlon.com/p1855779/sq/1855779.jpg?f=224x224
Saving image: ./data/images/1.jpg from https://contents.mediadecathlon.com/m11391253/k$148c4677adee0722821c354c34d12dc5/sq/Chaussures+de+running+Homme+Catamount+Brooks.jpg?f=224x0
Saving image: ./data/images/2.jpg from https://contents.mediadecathlon.com/p629818/sq/629818.jpg?f=224x224
Saving image: ./data/images/3.jpg from https://contents.mediadecathlon.com/p823162/sq/823162.jpg?f=224x224
Saving image: ./data/images/4.jpg from https://contents.mediadecathlon.com/m7952071/k$e6a785deba663f6196c073e6ef27c8a9/sq/Gants+courts+100+Sling.jpg?f=224x0
Saving image: ./data/images/5.jpg from https://contents.mediadecathlon.com/m1392352/k$fec89a310b5c6cb22bd09e66604b7847/sq/Laguna+28+Roller+glaci+re+sur+roues+bleu+pour+camping+et+randonn+e+26+Litres.jpg?f=224x0
Saving image: ./data/images/6.jpg from https://contents.mediadecathlon.com/m6899674/k$8dccf32081e8ef505f900880e729e15d/sq/Discada

### Load CLIP Encoder for feature extraction

In [5]:
import torch
import clip
from PIL import Image

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP model on that device, together with the preprocessing
# callable that clip.load returns alongside it.
model, preprocess = clip.load("ViT-B/32", device=device)

def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    A tensor that tracks gradients is detached from the autograd graph first,
    because ``.numpy()`` cannot be called on a tensor that requires grad.

    Args:
        tensor: The torch tensor to convert (on any device).

    Returns:
        The tensor's values as a ``numpy.ndarray``.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

  from .autonotebook import tqdm as notebook_tqdm


### Get feature vectors

In [6]:
# Encode every downloaded image with CLIP; imgs_fv collects one list of floats per image,
# in the same order as image_file_names.
imgs_fv = []

for img_file in image_file_names:
    # Open the image in a context manager so its file handle is released once the
    # pixel data has been turned into a tensor.
    with Image.open(img_file) as pil_image:
        # unsqueeze(0) adds the leading batch dimension before moving to the device.
        image = preprocess(pil_image).unsqueeze(0).to(device)
    # Inference only: skip gradient tracking while running the encoder.
    with torch.no_grad():
        image_features = model.encode_image(image)
    # The batch holds a single image, so keep row 0 of the converted output.
    imgs_fv.append(to_numpy(image_features).tolist()[0])

### Make small dataset with vectors

In [7]:
# Keep one row per computed vector (the first len(imgs_fv) products) for the small dataset.
# Slice first and copy second, so only those rows are duplicated instead of the whole frame.
df_with_fv = df.iloc[:len(imgs_fv)].copy()
# Store each image's feature vector in a new 'vector' column.
df_with_fv['vector'] = imgs_fv

In [8]:
# Preview the small dataset to confirm the feature vectors were attached to each row.
df_with_fv.head() # new column named vector

Unnamed: 0,product_id,season_year,brand_label,gender_label,image_url,sports_id_list,dsm_code,color_label,nature_id,sports_labels_list,product_label,nature_label,gender_id,image_id,country_code,active,vector
0,8612301,2020.0,DITA,KIDS,https://contents.mediadecathlon.com/p1855779/s...,169,X8612301,LIGHT PINK,576630.0,field hockey,STICK DITA MEGATEC ENFANT PINK,HOCKEY STICK,4.0,p1855779,old,active,"[-0.01212814450263977, 0.32004088163375854, 0...."
1,018dc1e4-9d0f-4548-baa9-d8b62549fa85,,,,https://contents.mediadecathlon.com/m11391253/...,,,,,,,,,m11391253,marketplace,inactive,"[-0.05312139168381691, 0.016751728951931, -0.0..."
2,8227830,2012.0,WEDZE,WOMEN'S,https://contents.mediadecathlon.com/p629818/sq...,232|233|234,X8227830,N/D,296992.0,alpine skiing|freeride skiing|freestyle skiing,FLOWFIT L SL2 MADRAS LIME 12,BASE LAYER,3.0,p629818,old,active,"[0.2695339024066925, 0.090487040579319, 0.0400..."
3,8170111,2011.0,VMC,NO GENDER,https://contents.mediadecathlon.com/p823162/sq...,99,X8170111,N/D,500384.0,athletics,HOOK 7356 BN,HOOKS,13.0,p823162,old,active,"[0.25236183404922485, -0.6132733225822449, 0.3..."
4,f692a4e1-4bb7-4cf2-992a-7a0cf2c090fe,,,,https://contents.mediadecathlon.com/m7952071/k...,,,,,,,,,m7952071,marketplace,inactive,"[0.09365296363830566, 0.2518293261528015, 0.03..."


In [9]:
# Persist the small dataset as a semicolon-separated CSV.
# index=False is not passed, so the row index is written as the first column.
# NOTE: the 'vector' cells are Python lists and are stored in the CSV as their text form.
df_with_fv.to_csv('./data/vectors.csv', sep=';')

In [10]:
import ast

import pandas as pd

# Read the saved CSV back to verify the round trip. The first column is the row index
# written by to_csv, hence index_col=0.
# The 'vector' column was stored as the text form of a Python list; without a converter it
# would come back as a plain string, so parse it back into a list of floats.
df_new = pd.read_csv(
    './data/vectors.csv',
    sep=';',
    index_col=0,
    converters={'vector': ast.literal_eval},
)
df_new.head()

Unnamed: 0,product_id,season_year,brand_label,gender_label,image_url,sports_id_list,dsm_code,color_label,nature_id,sports_labels_list,product_label,nature_label,gender_id,image_id,country_code,active,vector
0,8612301,2020.0,DITA,KIDS,https://contents.mediadecathlon.com/p1855779/s...,169,X8612301,LIGHT PINK,576630.0,field hockey,STICK DITA MEGATEC ENFANT PINK,HOCKEY STICK,4.0,p1855779,old,active,"[-0.01212814450263977, 0.32004088163375854, 0...."
1,018dc1e4-9d0f-4548-baa9-d8b62549fa85,,,,https://contents.mediadecathlon.com/m11391253/...,,,,,,,,,m11391253,marketplace,inactive,"[-0.05312139168381691, 0.016751728951931, -0.0..."
2,8227830,2012.0,WEDZE,WOMEN'S,https://contents.mediadecathlon.com/p629818/sq...,232|233|234,X8227830,N/D,296992.0,alpine skiing|freeride skiing|freestyle skiing,FLOWFIT L SL2 MADRAS LIME 12,BASE LAYER,3.0,p629818,old,active,"[0.2695339024066925, 0.090487040579319, 0.0400..."
3,8170111,2011.0,VMC,NO GENDER,https://contents.mediadecathlon.com/p823162/sq...,99,X8170111,N/D,500384.0,athletics,HOOK 7356 BN,HOOKS,13.0,p823162,old,active,"[0.25236183404922485, -0.6132733225822449, 0.3..."
4,f692a4e1-4bb7-4cf2-992a-7a0cf2c090fe,,,,https://contents.mediadecathlon.com/m7952071/k...,,,,,,,,,m7952071,marketplace,inactive,"[0.09365296363830566, 0.2518293261528015, 0.03..."
