In [1]:
import json
import pandas as pd

# Load the training ratings, the item metadata table and the test-set user ids.
ratings = pd.read_csv('../data/train.csv')
item_metadata = pd.read_csv('../data/item_metadata.csv')
test_users = pd.read_csv('../data/test.csv')["user_id"]

# Read the id mappings inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('../data/id_mappings.json', encoding='utf-8') as mapping_file:
    mapping = json.load(mapping_file)

# Keep only the metadata rows whose parent_asin is a key of the item mapping.
relevant_items = set(mapping['item_mapping'].keys())
item_metadata = item_metadata.query("parent_asin in @relevant_items").reset_index(drop=True)

In [2]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw item ids, then store the resulting codes in a
# new `item_id_enc` column of `ratings`.
le = LabelEncoder()
le.fit(ratings['item_id'])
ratings['item_id_enc'] = le.transform(ratings['item_id'])

In [None]:
from PIL import Image
from transformers import AutoProcessor, AutoModel
import torch

# Alternative checkpoint, kept for reference:
# model = AutoModel.from_pretrained("google/siglip-so400m-patch14-224", device_map="cuda" )
# processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-224", device_map="cuda")

# The model and its processor are loaded from the same checkpoint name with
# the same keyword arguments, so both are spelled out once.
siglip_checkpoint = "google/siglip-base-patch16-224"
siglip_load_kwargs = {"device_map": "cuda"}

model = AutoModel.from_pretrained(siglip_checkpoint, **siglip_load_kwargs)
processor = AutoProcessor.from_pretrained(siglip_checkpoint, **siglip_load_kwargs)

In [None]:
import os
from tqdm import tqdm

# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory. Named IMAGE_DIR so the builtin `dir` is not shadowed.
IMAGE_DIR = "/home/nekoneko/ML_challenge/ds_club/data/images"
BATCH_SIZE = 100  # number of image files passed to the processor per forward pass

imgs = os.listdir(IMAGE_DIR)
img_list = [f"{IMAGE_DIR}/{img_name}" for img_name in imgs]

all_features = []   # one CPU tensor of image features per processed batch
all_names = []      # file names, aligned one-to-one with the feature rows
skipped_paths = []  # (path, error) pairs for files that could not be decoded

for start in tqdm(range(0, len(img_list), BATCH_SIZE)):
    batch_paths = img_list[start:start + BATCH_SIZE]
    batch_images = []
    valid_paths = []

    for path in batch_paths:
        try:
            # The context manager closes the underlying file as soon as the
            # RGB copy has been made.
            with Image.open(path) as img:
                batch_images.append(img.convert('RGB'))
        except Exception as exc:
            # Best effort: an unreadable file is skipped, but recorded rather
            # than silently dropped. `Exception` (unlike a bare `except`)
            # still lets KeyboardInterrupt stop the loop.
            skipped_paths.append((path, repr(exc)))
            continue
        valid_paths.append(path)

    # Nothing decodable in this batch: there is nothing to feed to the model.
    if not batch_images:
        continue

    inputs = processor(text=None, images=batch_images, padding="max_length", return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.get_image_features(**inputs)

    # Move results off the GPU right away so device memory does not grow
    # with the number of batches.
    all_features.append(outputs.cpu())
    all_names.extend(os.path.basename(path) for path in valid_paths)

if skipped_paths:
    print(f"Skipped {len(skipped_paths)} unreadable file(s); see `skipped_paths`.")

# final_features = torch.cat(all_features, dim=0)

In [None]:
# Persist the raw per-batch feature tensors together with the file names.
raw_checkpoint = {'features': all_features, 'names': all_names}
torch.save(raw_checkpoint, 'image_features_siglip_base.pt')

# Split every batch tensor along its first dimension: one tensor per image.
features_flat = [row for batch in all_features for row in torch.unbind(batch)]

# Key each feature by the mapped id of its file stem (the text before the
# first '.'); files whose stem is not in the item mapping are dropped.
item_lookup = mapping['item_mapping']
file_stems = (file_name.split('.')[0] for file_name in all_names)
item_to_feature = {
    item_lookup[stem]: feature
    for stem, feature in zip(file_stems, features_flat)
    if stem in item_lookup
}

torch.save(item_to_feature, "item_to_siglip_feature.dict")

In [None]:
# Reload the item -> feature dictionary from disk.
# NOTE(review): this is an absolute, machine-specific path, while the cell that
# writes "item_to_siglip_feature.dict" uses a path relative to the working
# directory — confirm both refer to the same file.
embs_path = "/home/nekoneko/ML_challenge/ds_club/siglip/item_to_siglip_feature.dict"
item_to_feature = torch.load(embs_path)

In [20]:
# Sanity check: split the feature keys into those the fitted LabelEncoder can
# encode and those it cannot.
bueno = []    # results of le.transform for keys the encoder knows
nonloso = []  # keys that are not integer-like or that the encoder has not seen
for item in item_to_feature.keys():
    try:
        bueno.append(le.transform([int(item)]))
    except (TypeError, ValueError):
        # int() rejects non-numeric keys, and LabelEncoder.transform raises
        # ValueError for labels that were absent during fit. Anything else is
        # a genuine error and is allowed to propagate.
        nonloso.append(item)

In [42]:
# Build one feature vector per LabelEncoder class, in encoder order, so that
# position i holds the vector of the item whose encoded id is i.
features_orderd = []
missing_items = []

# Take the width of the fallback vectors from a stored feature instead of
# hard-coding it, so a checkpoint with another embedding size cannot break the
# later torch.stack. 768 is kept only as the fallback for an empty dictionary.
feature_dim = next(iter(item_to_feature.values())).shape[-1] if item_to_feature else 768

# Dedicated seeded generator: the random stand-ins for missing items are
# identical on every run and the global RNG state is left untouched.
fallback_rng = torch.Generator().manual_seed(0)

# le.classes_[i] is the original label for code i, i.e. the same value that
# le.inverse_transform([i])[0] returns, without one encoder call per item.
for idx in le.classes_:
    if idx in item_to_feature:
        feat = item_to_feature[idx]
    else:
        # No stored feature for this item: record it and use a random vector.
        missing_items.append(idx)
        feat = torch.randn(feature_dim, generator=fallback_rng)
    features_orderd.append(feat)

In [44]:
# Number of encoder classes that had no stored feature and received a random vector.
len(missing_items)

12

In [47]:
# Stack the per-item vectors into one tensor; row order follows the LabelEncoder classes.
all_features_ordered = torch.stack(features_orderd)

In [48]:
# Shape check: one row per encoder class, one column per feature dimension.
all_features_ordered.shape

torch.Size([76747, 768])

In [49]:
import torch.nn as nn


# Wrap the ordered feature matrix in an embedding layer so an encoded item id
# looks up its feature row; freeze=True keeps the weights out of gradient updates.
emb = nn.Embedding.from_pretrained(all_features_ordered, freeze=True)

In [50]:
# Saves the nn.Embedding module object itself (not a state_dict), so reading it
# back requires full unpickling rather than a weights-only load.
torch.save(emb, "siglip_ordered_embedding.pth")

In [None]:
# test loading
# weights_only=False enables full unpickling, which the saved nn.Embedding
# module object needs; unpickling can execute arbitrary code, so only load
# files from a trusted source.
siglip_emb = torch.load("siglip_ordered_embedding.pth", weights_only=False)