In [None]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
import tqdm
import duckdb
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import gc
import matplotlib.pyplot as plt
import glob
from torchsummary import summary



class ShopeeModel(nn.Module):
    """Image-embedding network: a timm backbone followed by a small projection head.

    The backbone's own final layer (``classifier`` for ``tf_efficientnet_b4``,
    ``head`` for every other model name) and its ``global_pool`` are replaced
    with ``nn.Identity``. Backbone features then go through
    dropout -> linear -> batch-norm and are normalised by ``F.normalize``
    with its default arguments.

    Args:
        model_name: Name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        fc_dim: Output width of the projection layer (also stored as
            ``vector_size``).
    """

    def __init__(self, model_name, pretrained, fc_dim=512):
        super().__init__()
        self.model_name = model_name
        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Remember the width feeding the backbone's final layer, then disable that layer.
        if model_name == "tf_efficientnet_b4":
            in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        else:
            in_features = self.backbone.head.in_features
            self.backbone.head = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)

        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.vector_size = fc_dim

    def _init_params(self):
        """Xavier-normal projection weights, zero biases, unit batch-norm scale."""
        projection, norm = self.classifier, self.bn
        nn.init.xavier_normal_(projection.weight)
        nn.init.zeros_(projection.bias)
        nn.init.ones_(norm.weight)
        nn.init.zeros_(norm.bias)

    def forward(self, x):
        """Return one normalised embedding row per sample in ``x``."""
        n_samples = x.shape[0]
        features = self.backbone(x)
        # Only the EfficientNet branch is pooled here; other backbones are flattened as-is.
        if self.model_name == "tf_efficientnet_b4":
            features = self.pooling(features)
        flat = features.view(n_samples, -1)

        projected = self.bn(self.classifier(self.dropout(flat)))
        return F.normalize(projected)



def get_images_name(set_type):
    """Return the lines of ``./valid_{set_type}_images.txt``, whitespace-stripped.

    Args:
        set_type: Token substituted into the listing file name.

    Returns:
        list[str]: one entry per line of the file, in file order. Blank lines
        are kept as empty strings.
    """
    listing_path = f'./valid_{set_type}_images.txt'
    with open(listing_path, 'r') as listing:
        return [entry.strip() for entry in listing]


def load_and_preprocess(path):
    """Load one image and return a resized, scaled tensor for every entry in ``sizes``.

    Args:
        path: Image file path handed to ``cv2.imread``.

    Returns:
        list[torch.Tensor]: one float tensor per entry of the module-level
        ``sizes`` list, in the same order. Each is the resized image with its
        axes reordered by ``permute(2, 0, 1)`` and its values divided by 255.

    Raises:
        OSError: if ``cv2.imread`` could not read or decode ``path``.
    """
    img = cv2.imread(path)
    # cv2.imread reports failure by returning None instead of raising; stop here
    # with a message that names the file rather than failing inside cv2.resize.
    if img is None:
        raise OSError(f'cv2.imread could not read image: {path}')

    # NOTE(review): cv2.imread yields BGR channel order and no mean/std
    # normalisation is applied here - confirm this matches how the loaded
    # checkpoints were trained.
    return [
        torch.FloatTensor(cv2.resize(img, size)).permute(2, 0, 1) / 255.0
        for size in sizes
    ]


def split_set_into_n_chunks(txt_with_images_paths, n):
    """Split the sorted lines of a text file into ``n`` consecutive chunks.

    Lines are whitespace-stripped and sorted before splitting. Chunk lengths
    differ by at most one; the first ``len(lines) % n`` chunks receive the
    extra element.

    Args:
        txt_with_images_paths: Path of the text file, one entry per line.
        n: Number of chunks to produce.

    Returns:
        list[list[str]]: ``n`` lists (some possibly empty) that together hold
        every line exactly once, in sorted order.
    """
    with open(txt_with_images_paths, 'r') as listing:
        entries = sorted(entry.strip() for entry in listing)

    base_len, n_longer = divmod(len(entries), n)

    pieces = []
    cursor = 0
    for idx in range(n):
        # The leading `n_longer` chunks absorb the remainder, one item each.
        width = base_len + 1 if idx < n_longer else base_len
        pieces.append(entries[cursor:cursor + width])
        cursor += width

    return pieces

def batch_images_generator(path_to_folder, bs, resume_from_batch=None, list_of_images=None):
    """Yield preprocessed image batches, one tensor list per model.

    Args:
        path_to_folder: Directory that holds the images.
        bs: Number of image names taken per batch.
        resume_from_batch: Optional batch index to start from; a falsy value
            starts at the first batch.
        list_of_images: Optional explicit list of file names, resolved against
            ``path_to_folder``. When ``None`` the folder is globbed for
            ``*.jpg`` and zero-byte files are dropped.

    Yields:
        tuple: ``(i, valid_images, batches)`` where ``i`` is the offset of the
        batch inside the image list, ``valid_images`` are the names that were
        preprocessed successfully, and ``batches`` holds one list per entry of
        the module-level ``models``; each list contains the tensors returned by
        ``load_and_preprocess`` with a leading axis added by ``unsqueeze(0)``.
        Batches in which no image could be preprocessed are not yielded.
    """
    # `is not None` (rather than truthiness) so an explicitly empty list means
    # "nothing to process" instead of silently falling back to the whole folder.
    if list_of_images is not None:
        images = list_of_images
    else:
        images = sorted(glob.glob('*.jpg', root_dir=path_to_folder))
        images = [
            image for image in images
            if os.path.getsize(os.path.join(path_to_folder, image)) > 0
        ]

    print(len(images))
    start = resume_from_batch * bs if resume_from_batch else 0
    print(f'start from {start}')

    for i in tqdm.tqdm(range(start, len(images), bs)):
        batch_images = images[i:i + bs]

        batches = [[] for _ in range(len(models))]
        valid_images = []

        for image in batch_images:
            try:
                # os.path.join works whether or not path_to_folder ends with a separator.
                preprocessed = load_and_preprocess(os.path.join(path_to_folder, image))
                if len(preprocessed) != len(batches):
                    raise ValueError(
                        f'expected {len(batches)} preprocessed tensors, got {len(preprocessed)}'
                    )
            except Exception as e:
                # Best effort: report the failing image and carry on with the rest.
                print(e)
                print(image)
                continue

            # Append only once the image is known to be good for every model, so
            # `batches` and `valid_images` can never drift out of step.
            for k, preprocessed_image in enumerate(preprocessed):
                batches[k].append(preprocessed_image.unsqueeze(0))
            valid_images.append(image)

        # An all-failed batch would hand the consumer empty lists to concatenate.
        if not valid_images:
            continue

        yield i, valid_images, batches


def save_parquet(filenames, embedded, path, i):
    """Write one batch of (filename, embedding) rows to a parquet file.

    The file is ``batch_<i // bs>.parquet`` inside ``path``; ``bs`` is read
    from module scope, so it must be defined before this function is called.

    Args:
        filenames: Values for the ``filename`` column.
        embedded: Values for the ``embedding`` column, one per filename.
        path: Output directory.
        i: Offset of the batch; integer-divided by ``bs`` to number the file.
    """
    frame = pd.DataFrame({"filename": filenames, "embedding": embedded})
    target = os.path.join(path, f"batch_{i // bs}.parquet")
    frame.to_parquet(target, index=False)



   

In [None]:



class CFG:
    """Static configuration namespace.

    Of these attributes only ``device`` is read by the cells visible in this
    notebook; the remaining values are kept unchanged.
    """

    seed = 54

    # presumably training-time hyperparameters (label count, margin-loss
    # scale/margin) - not read in the visible cells; TODO confirm
    classes = 11014
    scale = 30
    margin = 0.5

    model_name = 'tf_efficientnet_b4'
    fc_dim = 512
    img_size = 512

    # NOTE(review): the inference cell uses its own `bs`, not this batch_size.
    batch_size = 20
    num_workers = 4

    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = 'cpu' if not torch.cuda.is_available() else 'cuda'


# Same value as CFG.device; kept as a module-level name because the inference
# cell moves batches with `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The three lists below are parallel: entry k of each describes model k
# (backbone name, checkpoint path, input size passed to cv2.resize).
img_backbones = [
    #"swin_base_patch4_window12_384", # 384
    'tf_efficientnet_b4', # 512
    "vit_base_r50_s16_384" # 384
]

img_model_paths = [
    #'./weights/top5/img_model_i15.pth',
    './weights/top5/img_model_i04.pth',
    './weights/top5/img_model_i11.pth'
]

models = [None for _ in range(len(img_backbones))]

sizes = [
    #(384, 384),
    (512, 512),
    (384, 384)
]

# Catch a half-edited configuration before any checkpoint is read.
assert len(img_backbones) == len(img_model_paths) == len(sizes), \
    'img_backbones, img_model_paths and sizes must have the same length'

for i in range(len(img_backbones)):
    try:
        model = ShopeeModel(img_backbones[i],  pretrained=False).to(CFG.device)
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
        state_dict = torch.load(img_model_paths[i], map_location=CFG.device)
        model.load_state_dict(state_dict)
        model.eval()
        # Publish the model only once its weights are loaded and it is in eval
        # mode, so a failed load can never leave a randomly initialised network
        # in `models`.
        models[i] = model
        print(f'{i} completed')
    except Exception as e:
        models[i] = None
        print(e)
        print(f'{i} failed: {img_backbones[i]} left unloaded')
        continue

In [None]:
bs = 128
N_CHUNKS = 4
CURRENT_CHUNK_TO_PROCESS = 0
SET_TYPE = 'train'

# NOTE(review): the [:1000] slice caps the run at the first 1000 listed images -
# looks like a debugging limit; confirm before a full run.
images = get_images_name(SET_TYPE)[:1000]
path = rf'C:\avito\images/{SET_TYPE}/images/'
output_dir = fr'C:\avito\images\{SET_TYPE}\parquets/'
path_to_zip_paths = f'./{SET_TYPE}_images_zip_paths.txt'
#chunks = split_set_into_n_chunks(path_to_zip_paths, N_CHUNKS)
#images = chunks[CURRENT_CHUNK_TO_PROCESS]

# Fail before the long loop if any checkpoint did not load: `models` entries
# start out as None and would otherwise only blow up at the first inference call.
unloaded = [name for name, model in zip(img_backbones, models) if model is None]
if unloaded:
    raise RuntimeError(f'models not loaded: {unloaded}')

for model_name in img_backbones:
    os.makedirs(output_dir + model_name, exist_ok=True)

os.makedirs(output_dir + 'concat', exist_ok=True)

batches_written = 0

for i, filenames, out in batch_images_generator(path_to_folder=path,
                                                bs=bs,
                                                list_of_images=images):

    # Every image of this batch failed to load: torch.cat would raise on the
    # empty lists, and there is nothing to save anyway.
    if not filenames:
        continue

    result = []

    with torch.no_grad():
        for k, batch in enumerate(out):

            batch = torch.cat(batch).to(device)
            result.append(models[k](batch).cpu().numpy())

            # Drop the device-side batch before asking torch to release its cache.
            del batch
            gc.collect()
            torch.cuda.empty_cache()

    for k, res in enumerate(result):
        save_parquet(filenames, res.tolist(), output_dir + img_backbones[k], i)

    if len(result) > 1:
        # Per-image concatenation of all model embeddings along the feature axis.
        concat = np.concatenate(result, axis=1)
        save_parquet(filenames, concat.tolist(), output_dir + 'concat', i)

    batches_written += 1

# 'concat' batches are only written when more than one model produced output,
# so only merge that directory in the multi-model case.
merge_targets = list(img_backbones)
if len(img_backbones) > 1:
    merge_targets.append('concat')

# NOTE: the merge globs every *.parquet in each directory, and batch files are
# named by batch index only, so files left over from an earlier run with more
# batches would be merged too - clear the directories between runs.
if batches_written == 0:
    print('no batches were written; skipping parquet merge')
else:
    for model_name in merge_targets:
        duckdb.sql(rf"""
            COPY (
                SELECT * FROM '{output_dir + model_name}/*.parquet'
            )
            TO '{output_dir}/final_{model_name}_chunk{CURRENT_CHUNK_TO_PROCESS}.parquet' (FORMAT PARQUET)
        """)