In [1]:
import numpy as np 
import pandas as pd 

import math
import random 
import os 
import cv2
import timm

from tqdm import tqdm 
import h5py

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch.utils.data import Dataset 
from torch import nn
import torch.nn.functional as F 

import plotly.express as px

import gc
import cudf
import cuml
import cupy
from cuml.neighbors import NearestNeighbors
from annoy import AnnoyIndex

In [2]:
class Config:
    """Notebook-wide settings: file locations, preprocessing constants and model options."""

    # ---- file locations ----
    DATA_DIR = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
    TRAIN_CSV = '/mnt/hdd1/wearly/deep_rec/separ_train.csv'
    TEST_CSV = '/mnt/hdd1/wearly/deep_rec/separ_test.csv'

    # ---- reproducibility ----
    SEED = 123

    # ---- image preprocessing ----
    IMG_SIZE = 224
    # presumably the ImageNet per-channel mean/std used for normalisation — TODO confirm
    MEAN = [0.485, 0.456, 0.406]
    STD = [0.229, 0.224, 0.225]

    # ---- batching / data loading ----
    BATCH_SIZE = 128
    N_FOLDS = 10
    NUM_WORKERS = 4

    # ---- runtime device: GPU when torch can see one, otherwise CPU (alternative: 'cuda:0') ----
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ---- model head ----
    # FC_DIM is also used below as the vector size of the Annoy index
    FC_DIM = 512
    CLASSES = 5062
    # NOTE(review): SCALE / MARGIN look like margin-based softmax head settings — confirm
    SCALE = 30
    MARGIN = 0.5

    # ---- checkpoint ----
    MODEL_NAME = 'tf_efficientnet_b3'
    MODEL_PATH = './separ_tf_efficientnet_b3_60_Weights/tf_efficientnet_b3_16EpochStep_adam.pt'
    TYP = "test"
    

def seed_setting(seed=Config.SEED):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), sets the
    ``PYTHONHASHSEED`` environment variable, and asks cuDNN / PyTorch for
    deterministic kernels.

    Args:
        seed: Integer seed; defaults to ``Config.SEED``.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes);
    # the hash seed of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # This must be a call: assigning ``True`` to the attribute replaces the torch
    # function with a bool and enables nothing. ``warn_only=True`` makes ops that
    # lack a deterministic implementation warn instead of raising.
    torch.use_deterministic_algorithms(True, warn_only=True)

In [3]:
# https://data-newbie.tistory.com/472
# Report how much memory a pandas object occupies
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a DataFrame or Series as an ``"N.NN MB"`` string."""
    usage = pandas_obj.memory_usage(deep=True)
    # A DataFrame reports one figure per entry and needs summing; anything
    # else is assumed to be a Series, which already reports a single total.
    total_bytes = usage.sum() if isinstance(pandas_obj, pd.DataFrame) else usage
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)  # bytes -> megabytes

## Print the average memory use per dtype family
def type_memory(data):
    """Print, for float / int / object columns in turn, the mean of ``memory_usage(deep=True)`` in MB.

    The mean is taken over everything ``memory_usage`` reports for the selected
    columns, which by default includes the index entry.
    """
    bytes_per_mb = 1024 ** 2
    for dtype in ('float', 'int', 'object'):
        per_entry_bytes = data.select_dtypes(include=[dtype]).memory_usage(deep=True)
        print("Average memory usage for {} columns: {:03.2f} MB".format(dtype, per_entry_bytes.mean() / bytes_per_mb))

## Shrink integer columns
def int_memory_reduce(data):
    """Downcast the integer columns of ``data`` to the smallest unsigned type that fits.

    Prints the memory use of the integer columns before and after, writes the
    converted columns back into ``data`` (the frame is modified in place) and
    returns that same frame.
    """
    int_part = data.select_dtypes(include=['int'])
    shrunk = int_part.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    print(f"Before : {mem_usage(int_part)} -> After : {mem_usage(shrunk)}")
    data[shrunk.columns] = shrunk
    return data

## Shrink floating-point columns
def float_memory_reduce(data):
    """Downcast the float columns of ``data`` to the smallest float type that fits.

    Prints the memory use of the float columns before and after, writes the
    converted columns back into ``data`` (the frame is modified in place) and
    returns that same frame.
    """
    float_part = data.select_dtypes(include=['float'])
    shrunk = float_part.apply(lambda column: pd.to_numeric(column, downcast='float'))
    print(f"Before : {mem_usage(float_part)} -> After : {mem_usage(shrunk)}")
    data[shrunk.columns] = shrunk
    return data

## Shrink string (object) columns
def object_memory_reduce(data):
    """Convert low-cardinality object columns of ``data`` to ``category`` dtype.

    A column is converted when fewer than half of its values are distinct;
    otherwise it is kept as it is. Prints the memory use of the object columns
    before and after, writes the result back into ``data`` (the frame is
    modified in place) and returns that same frame.

    A frame with zero rows is returned with its object columns untouched
    instead of failing on the unique/total ratio.
    """
    gl_obj = data.select_dtypes(include=['object']).copy()
    # Start from a copy so unconverted columns, the column order and the index
    # carry over without growing an empty frame one column at a time.
    converted_obj = gl_obj.copy()
    num_total_values = len(gl_obj)
    # The ratio is undefined for a zero-row frame, so skip conversion entirely.
    if num_total_values > 0:
        for col in gl_obj.columns:
            num_unique_values = len(gl_obj[col].unique())
            if num_unique_values / num_total_values < 0.5:
                converted_obj[col] = gl_obj[col].astype('category')
    print(f"Before : {mem_usage(gl_obj)} -> After : {mem_usage(converted_obj)}")
    data[converted_obj.columns] = converted_obj
    return data

In [4]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated name strings.

    Each string is split on whitespace into a set of names. For every row the
    score is ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        (f1, intersection): the per-row F1 values and the per-row overlap
        counts, both as arrays.
    """
    def _as_name_set(text):
        return frozenset(text.split())

    true_sets = y_true.apply(_as_name_set)
    pred_sets = y_pred.apply(_as_name_set)

    overlap_sizes = [len(truth & guess) for truth, guess in tqdm(zip(true_sets, pred_sets))]
    intersection = np.array(overlap_sizes)

    pred_sizes = pred_sets.apply(lambda names: len(names)).values
    true_sizes = true_sets.apply(lambda names: len(names)).values
    f1 = 2 * intersection / (pred_sizes + true_sizes)
    return f1, intersection

def f1score_eval(df):
    """Score ``df['pred_matches']`` against ``df['label_group']`` with ``f1_score``.

    Stores the per-row F1 in ``df['f1']`` (the frame is modified in place),
    prints the mean and returns ``(mean_f1, per-row intersection counts)``.
    """
    per_row_f1, intersection = f1_score(df['label_group'], df['pred_matches'])
    df['f1'] = per_row_f1
    score = df['f1'].mean()
    print(f'Our f1 score is {score}')
    return score, intersection

In [5]:
def gain_intersections(df):
    """Count, per row, the names shared by ``label_group`` and ``pred_matches``.

    Both columns hold space-separated name strings; each is split on
    whitespace into a set before the overlap is measured.

    Returns:
        An array with one overlap count per row of ``df``.
    """
    true_sets = df['label_group'].apply(lambda text: frozenset(text.split()))
    pred_sets = df['pred_matches'].apply(lambda text: frozenset(text.split()))
    overlap_sizes = [len(truth & guess) for truth, guess in tqdm(zip(true_sets, pred_sets))]
    return np.array(overlap_sizes)

In [6]:
# https://www.kaggle.com/code/tanulsingh077/metric-learning-image-tfidf-inference
def get_neighbors(df, embeddings, KNN = 30, threshold = 2.7):
    """Collect, for every embedding, the image names of its close nearest neighbours.

    A ``NearestNeighbors`` model is fitted on ``embeddings`` and queried with
    the same array. For each row, the neighbours whose distance is below
    ``threshold`` are kept and translated to ``df['image_name']`` values by
    row position.

    Args:
        df: DataFrame with an ``image_name`` column; row ``i`` must describe
            ``embeddings[i]``.
        embeddings: 2-D array, one embedding per row.
        KNN: Number of neighbours requested per row.
        threshold: Distance cutoff; only neighbours strictly closer than this
            are kept. Defaults to the previously hard-coded 2.7.

    Returns:
        (df, predictions): the input frame, unchanged, and a list holding one
        array of image names per embedding row.

    Raises:
        ValueError: if ``embeddings`` has more rows than ``df``, in which case
            neighbour indices could not be mapped back to rows of ``df``.
    """
    n_rows = embeddings.shape[0]
    # Fail before the expensive fit; otherwise this only surfaces mid-loop as an
    # out-of-bounds positional lookup.
    if n_rows > len(df):
        raise ValueError(
            f"embeddings has {n_rows} rows but df has only {len(df)}; "
            "neighbour indices cannot be mapped to image names"
        )

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    image_names = df['image_name']  # looked up once instead of once per row
    predictions = []
    for k in tqdm(range(n_rows)):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        predictions.append(image_names.iloc[ids].values)

    # Drop the fitted model and the neighbour matrices before returning.
    del model, distances, indices
    gc.collect()
    return df, predictions

In [7]:
# Read the precomputed embeddings fully into memory; the context manager closes
# the HDF5 file once the array has been copied out.
with h5py.File('../embeddings/all_emb.h5','r') as h5f:
    image_embeddings = h5f['all_data'][:]
# Report success only after the data has actually been read.
print("---------- Embedding Loading Successful ----------")

# annoy indexing
vector_size = Config.FC_DIM
load_index = AnnoyIndex(vector_size, 'dot')
load_index.load('../embeddings/test.annoy')
print("---------- Annoy Model Loading Successful ----------")

---------- Embedding Loading Successful ----------
---------- Annoy Model Loading Successful ----------


In [8]:
# Load the evaluation table and store the two text columns as categoricals.
# NOTE: read_pickle can execute arbitrary code while loading; only use it on trusted files.
df = pd.read_pickle("../embeddings/all_dt_eval.pkl").astype(
    {"pred_matches": "category", "label_group": "category"}
)
print(df.shape)
df.head(3)

(559922, 3)


Unnamed: 0,image_name,label_group,pred_matches
0,train/manish_546453_best.jpg,train/manish_546453_best.jpg train/modern_7091...,train/street_963468_jacket.jpg train/street_13...
1,train/modern_709120_best.jpg,train/manish_546453_best.jpg train/modern_7091...,train/sophisticated_1138022_best.jpg val/sophi...
2,train/classic_746922_best.jpg,train/manish_546453_best.jpg train/modern_7091...,train/sophisticated_1138022_best.jpg train/mod...


In [9]:
# Test split: read the CSV with its first column as the index and keep only the image names.
te = pd.read_csv("../separ_test.csv", index_col=0).loc[:, ["image_name"]]
print(te.shape)
te.head(3)

(55993, 1)


Unnamed: 0,image_name
0,train/modern_1220850_best.jpg
1,train/street_61802_best.jpg
2,val/country_881566_best.jpg


In [10]:
# Keep only the rows of the evaluation table whose image_name appears in the test split
# (merge defaults to an inner join). Note that this rebinds df to the narrowed frame.
df = te.merge(df, on="image_name")

In [11]:
# v_ls = [frozenset(df.label_group[i].split()) for i in tqdm(range(len(df.label_group)))]

In [12]:
# Count, per row, how many predicted names also appear in the row's label group.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, and
# f1score_eval(df) also returns these intersection counts — this call may be
# redundant; confirm before removing.
intersection = gain_intersections(df)

KeyboardInterrupt: 

In [None]:
# Writes the per-row F1 into df['f1'], prints the mean, and returns
# (mean F1, per-row intersection counts); so f1 here is the mean score.
f1, intersection = f1score_eval(df)

In [None]:
# Keep the per-row overlap counts alongside the predictions they were computed from.
df["intersection"] = intersection

In [None]:
# Per-row precision using a fixed denominator of 20, then its mean over all rows.
# NOTE(review): this assumes every pred_matches entry holds exactly 20 predicted
# names — confirm against how all_dt_eval.pkl was produced.
df["precision"] = df["intersection"] / 20
df["precision"].mean()

In [None]:
# get_neighbors returns the frame it was given, so ddf is the same object as df (not a copy).
# NOTE(review): df was narrowed by the merge with te, while image_embeddings is the
# full 'all_data' array from all_emb.h5 — confirm both have the same number of rows in
# the same order, because neighbour indices are used as row positions in df.
ddf, predictions = get_neighbors(df, image_embeddings)

In [None]:
# Replace pred_matches with the neighbour-based predictions: each row's array of
# image names becomes one space-separated string, the format f1_score splits on.
ddf["pred_matches"] = [' '.join(names) for names in tqdm(predictions)]

In [None]:
# Re-score using the neighbour-based pred_matches: f1 receives the mean F1 and
# intersection the per-row overlap counts; ddf['f1'] is overwritten with per-row F1.
f1, intersection = f1score_eval(ddf)

In [None]:
ddf["intersection"] = intersection
# get_neighbors keeps a variable number of neighbours per row (everything under its
# distance cutoff, up to KNN), so a constant denominator of 20 can push precision
# above 1. Divide by the number of distinct names actually predicted for each row,
# matching the set semantics used by f1_score.
pred_counts = ddf["pred_matches"].apply(lambda text: len(set(text.split())))
ddf["precision"] = ddf["intersection"] / pred_counts
ddf["precision"].mean()