In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from PIL import Image
import matplotlib.pyplot as plt
import shutil

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from tqdm.notebook import tqdm

In [2]:
# Mount Google Drive at /content/drive; every path used below lives under
# /content/drive/MyDrive. The google.colab module is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Preparing the dataframe

In [32]:
# Load the per-image metadata table (file name plus label columns such as
# is_it_a_dog, color and tail) and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/liders_hack/pet-search/meta/result')
df.head()

Unnamed: 0,directopy,filename,is_animal_there,is_it_a_dog,is_the_owner_there,color,tail,address,cam_id
0,Датасет/Пустые/,6366.jpg,0.0,0.0,0.0,0.0,0.0,,
1,Датасет/Пустые/,348.jpg,0.0,0.0,0.0,0.0,0.0,,
2,Датасет/Пустые/,360.jpg,0.0,0.0,0.0,0.0,0.0,,
3,Датасет/Пустые/,228.jpg,0.0,0.0,0.0,0.0,0.0,,
4,Датасет/Пустые/,566.jpg,0.0,0.0,0.0,0.0,0.0,,


In [4]:
def find_mask(img):
    """Return the path of the pickled dog mask that belongs to an image name.

    Args:
        img: Image file name such as ``'638.jpg'``. Only the final extension
            is stripped, so names with several dots or with no extension at
            all are accepted.

    Returns:
        Path to ``<stem>.pkl`` inside the ``datasets/dog_masks`` folder, or
        ``np.nan`` when that file does not exist.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/dog_masks'
    # splitext strips only the last extension; unpacking `img.split('.')`
    # raised ValueError unless the name contained exactly one dot.
    stem, _ = os.path.splitext(img)
    path = os.path.join(base_path, stem + '.pkl')
    if not os.path.isfile(path):
        return np.nan
    return path

def change_ext(name):
    """Swap a ``.jpg`` extension for ``.png`` and vice versa.

    Args:
        name: File name or path. Only the final extension is inspected, so
            dots earlier in the name or in directory components are kept.

    Returns:
        ``name`` with ``.jpg`` replaced by ``.png`` or ``.png`` replaced by
        ``.jpg``. Any other extension (or none) is returned unchanged; the
        previous implementation raised UnboundLocalError in that case.
    """
    swapped = {'.png': '.jpg', '.jpg': '.png'}
    stem, ext = os.path.splitext(name)
    return stem + swapped.get(ext, ext)

def find_img(img):
    """Locate a training image in the ``Dataset_Masha/0`` or ``Dataset_Masha/1`` folder.

    The given name is tried first, then the same name with its jpg/png
    extension swapped; for each name folder ``0`` is checked before ``1``.

    Returns:
        Full path of the first existing file, or ``np.nan`` (after printing a
        "not found" message) when nothing matches.
    """
    folders = (
        '/content/drive/MyDrive/liders_hack/Dataset_Masha/0',
        '/content/drive/MyDrive/liders_hack/Dataset_Masha/1',
    )
    for candidate in (img, change_ext(img)):
        for folder in folders:
            full_path = os.path.join(folder, candidate)
            if os.path.isfile(full_path):
                return full_path
    print(f'{img} not found')
    return np.nan

def bbox_path(img_path):
    """Return the path with the same file name as ``img_path`` inside ``datasets/dogs``."""
    file_name = os.path.basename(img_path)
    return os.path.join('/content/drive/MyDrive/liders_hack/datasets/dogs', file_name)

In [5]:
def find_test_mask(img):
    """Return the path of the pickled dog mask for a test-set image name.

    Args:
        img: Image file name such as ``'638.jpg'``. Only the final extension
            is stripped, so names with several dots or with no extension at
            all are accepted.

    Returns:
        Path to ``<stem>.pkl`` inside the ``datasets/test/dog_masks`` folder,
        or ``np.nan`` when that file does not exist.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/test/dog_masks'
    # splitext strips only the last extension; unpacking `img.split('.')`
    # raised ValueError unless the name contained exactly one dot.
    stem, _ = os.path.splitext(img)
    path = os.path.join(base_path, stem + '.pkl')
    if not os.path.isfile(path):
        return np.nan
    return path

def find_test_img(img):
    """Locate a test image in ``datasets/test/all_pics``.

    Tries the given name, then the name with its jpg/png extension swapped.

    Returns:
        Full path of the first existing file, or ``np.nan`` (after printing a
        "not found" message) when neither exists.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/test/all_pics'
    candidates = [os.path.join(base_path, name) for name in (img, change_ext(img))]
    existing = next((path for path in candidates if os.path.isfile(path)), None)
    if existing is None:
        print(f'{img} not found')
        return np.nan
    return existing

def find_test_sr_img(img):
    """Locate an image by name in ``datasets/super_res/test/dogs``.

    Tries the given name, then the name with its jpg/png extension swapped.

    Returns:
        Full path of the first existing file, or ``np.nan`` (after printing a
        "not found" message) when neither exists.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/super_res/test/dogs'
    alternatives = (img, change_ext(img))
    for file_name in alternatives:
        full_path = os.path.join(base_path, file_name)
        if not os.path.isfile(full_path):
            continue
        return full_path
    print(f'{img} not found')
    return np.nan

def test_bbox_path(img_path):
    base = '/content/drive/MyDrive/liders_hack/datasets/test/dogs'
    _, img_path = os.path.split(img_path)
    return os.path.join(base, img_path)

In [None]:
# Drop every row labelled is_it_a_dog == 0, then attach the test image path,
# the test mask path and a copy of the file name as new columns.
dogs = df.drop(index=df.index[df.is_it_a_dog == 0]).assign(
    img_path=lambda frame: frame.filename.apply(find_test_img),
    dog_mask=lambda frame: frame.filename.apply(find_test_mask),
    img_name=lambda frame: frame.filename,
)

In [None]:
# Keep only the columns used downstream and discard rows where any of them is missing.
kept_columns = ['img_path', 'dog_mask', 'img_name', 'color', 'tail']
dogs = dogs[kept_columns].dropna()
print(len(dogs))
dogs

### Generate dog pictures from bounding boxes

In [None]:
# Crop the first detected box out of every image in `dogs`, enlarged by 10 % of
# its width/height on each side, and save it under test_bbox_path(img_path).
cnt = 0  # number of crops written
for _, row in tqdm(dogs.iterrows(), total=len(dogs)):  # total= lets tqdm show real progress
    # NOTE(review): pickle.load can execute arbitrary code; only use it on mask
    # files produced by this project.
    with open(row['dog_mask'], 'rb') as f:
        target_instances = pickle.load(f)

    # presumably target_instances[0][0] is the list of detections and each one
    # starts with (x0, y0, x1, y1) — TODO confirm against the mask generator.
    if len(target_instances[0][0]) == 0:
        # Nothing detected: report the file name and skip it without opening the image.
        print(os.path.split(row['img_path'])[1])
        continue
    bbox = np.asarray(target_instances[0][0][0][:4], dtype=np.int_)

    # Margin added on every side: a tenth of the box width and height.
    delta = (bbox[2:] - bbox[:2]) // 10

    # The context manager closes the file handle of each source image; the
    # previous bare Image.open() left one handle open per iteration.
    with Image.open(row['img_path']) as img:
        # Keep the enlarged box inside the image bounds.
        bbox[:2] = np.clip(bbox[:2] - delta, (0, 0), img.size)
        bbox[2:] = np.clip(bbox[2:] + delta, (0, 0), img.size)

        crop = img.crop(tuple(int(v) for v in bbox)).convert('RGB')
        crop.save(test_bbox_path(row['img_path']))
    cnt += 1
print(cnt)

In [None]:
# Number of crops saved by the bounding-box loop.
cnt

954

### For dogs only

In [35]:
def find_test_sr_dog_img(filename):
    """Locate the ``_out`` version of ``filename`` in ``datasets/super_res/test/dogs``.

    The ``_out`` name is tried first, then the same name with its jpg/png
    extension swapped.

    Returns:
        Full path of the first existing file, or ``np.nan`` when neither exists.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/super_res/test/dogs'
    sr_img = sr_dog_img_name(filename)
    candidates = [os.path.join(base_path, name) for name in (sr_img, change_ext(sr_img))]
    return next((path for path in candidates if os.path.isfile(path)), np.nan)

def find_sr_dog_img(filename):
    """Locate the ``_out`` version of ``filename`` in ``datasets/super_res/dogs_new``.

    The ``_out`` name is tried first, then the same name with its jpg/png
    extension swapped.

    Returns:
        Full path of the first existing file, or ``np.nan`` when neither exists.
    """
    base_path = '/content/drive/MyDrive/liders_hack/datasets/super_res/dogs_new'
    sr_img = sr_dog_img_name(filename)
    alternatives = (sr_img, change_ext(sr_img))
    for file_name in alternatives:
        full_path = os.path.join(base_path, file_name)
        if not os.path.isfile(full_path):
            continue
        return full_path
    return np.nan

def sr_dog_img_name(filename):
    """Insert ``_out`` before the extension of ``filename``.

    Args:
        filename: Image file name such as ``'638.jpg'``.

    Returns:
        The name with ``_out`` appended to the stem, e.g. ``'638_out.jpg'``.
        Only the final extension is treated as the extension, so names with
        several dots or with no extension no longer raise ValueError.
    """
    stem, ext = os.path.splitext(filename)
    return stem + '_out' + ext

def sr_img_dog_existing_name(img_path):
    """Return the file name under which ``img_path`` actually exists on disk.

    Checks ``img_path`` itself first and, if that is not a file, the same path
    with its jpg/png extension swapped.

    Returns:
        The base name of whichever path exists, or ``np.nan`` when neither does.
    """
    if not os.path.isfile(img_path):
        # Fall back to the sibling file with the other extension.
        img_path = change_ext(img_path)
        if not os.path.isfile(img_path):
            return np.nan
    return os.path.basename(img_path)

In [36]:
# Drop every row labelled is_it_a_dog == 0, keep only those with an existing
# `_out` image in the dogs_new folder, and record that image's file name.
dogs = (
    df.drop(index=df.index[df.is_it_a_dog == 0])
    .assign(img_path=lambda frame: frame.filename.apply(find_sr_dog_img))
    .dropna(subset=['img_path'])
    .assign(img_name=lambda frame: frame['img_path'].apply(sr_img_dog_existing_name))
)
dogs

Unnamed: 0,directopy,filename,is_animal_there,is_it_a_dog,is_the_owner_there,color,tail,address,cam_id,img_path,img_name
324,Датасет/Только собака/ТОЛЬКО СОБАКА -разноцвет...,638.jpg,1.0,1.0,0.0,0.0,1.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,638_out.jpg
325,Датасет/Только собака/ТОЛЬКО СОБАКА -разноцвет...,162.jpg,1.0,1.0,0.0,0.0,1.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,162_out.jpg
326,Датасет/Только собака/ТОЛЬКО СОБАКА -разноцвет...,6574.jpg,1.0,1.0,0.0,0.0,1.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,6574_out.jpg
327,Датасет/Только собака/ТОЛЬКО СОБАКА -разноцвет...,764.jpg,1.0,1.0,0.0,0.0,1.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,764_out.jpg
329,Датасет/Только собака/ТОЛЬКО СОБАКА -разноцвет...,6777.jpg,1.0,1.0,0.0,0.0,1.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,6777_out.jpg
...,...,...,...,...,...,...,...,...,...,...,...
1736,Датасет/Хозяин и собака/,6358.jpg,1.0,1.0,1.0,0.0,0.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,6358_out.jpg
1738,Датасет/Хозяин и собака/,6402.jpg,1.0,1.0,1.0,0.0,0.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,6402_out.jpg
1739,Датасет/Хозяин и собака/,6416.jpg,1.0,1.0,1.0,0.0,0.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,6416_out.jpg
1740,Датасет/Хозяин и собака/,2116.jpg,1.0,1.0,1.0,0.0,0.0,,,/content/drive/MyDrive/liders_hack/datasets/su...,2116_out.jpg


### Generate dog colors and tails image folders

In [37]:
base = '/content/drive/MyDrive/liders_hack/datasets/super_res/dogs_new'
base_colors = '/content/drive/MyDrive/liders_hack/datasets/super_res/colors'
base_tails = '/content/drive/MyDrive/liders_hack/datasets/super_res/tails'

# Copy every image in `base` into <base_colors>/<color>/ and <base_tails>/<tail>/,
# taking the labels from the row of `dogs` whose img_name equals the file name.
skipped = []  # files without exactly one fully labelled row in `dogs`
for img in tqdm(os.listdir(base)):
    match = dogs[dogs['img_name'] == img]
    # int(Series) is deprecated in pandas and raises for zero or several
    # matching rows, so the single row is selected explicitly instead.
    if len(match) != 1 or match[['color', 'tail']].isna().values.any():
        skipped.append(img)
        continue
    row = match.iloc[0]

    src = os.path.join(base, img)
    for root, label in ((base_colors, row['color']), (base_tails, row['tail'])):
        target_dir = os.path.join(root, str(int(label)))
        # Without an existing directory copy2 would write a plain file named
        # after the label and overwrite it on every iteration.
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy2(src, target_dir)

if skipped:
    print(f'{len(skipped)} file(s) skipped, no unique labelled row in dogs: {skipped}')

  0%|          | 0/954 [00:00<?, ?it/s]

### Generate dogs with padding
Required input size for all networks pretrained on ImageNet: `(224 x 224)`

In [None]:
base = '/content/drive/MyDrive/liders_hack/datasets/dogs'
base_colors = '/content/drive/MyDrive/liders_hack/datasets/padded_dogs/colors'
base_tails = '/content/drive/MyDrive/liders_hack/datasets/padded_dogs/tails'

target_size = np.array([224, 224])
bg_color = (0, 0, 0)

# Pad every image in `base` to at least `target_size` in each dimension and save
# it into <base_colors>/<color>/ and <base_tails>/<tail>/. Images that already
# exceed the target in a dimension keep their size in that dimension.
# NOTE(review): file names in `base` must equal dogs['img_name']; confirm which
# version of the `dogs` frame this cell is meant to run against.
skipped = []  # files without exactly one fully labelled row in `dogs`
for img in tqdm(os.listdir(base)):
    match = dogs[dogs['img_name'] == img]
    # int(Series) is deprecated in pandas and raises for zero or several
    # matching rows, so the single row is selected explicitly instead.
    if len(match) != 1 or match[['color', 'tail']].isna().values.any():
        skipped.append(img)
        continue
    row = match.iloc[0]

    # The context manager closes the source file once it has been pasted.
    with Image.open(os.path.join(base, img)) as pil_img:
        new_size = np.maximum(target_size, pil_img.size)
        new_img = Image.new(pil_img.mode, tuple(int(v) for v in new_size), color=bg_color)
        # paste() without a box places the image at the upper-left corner, so
        # the padding ends up on the right and bottom edges.
        new_img.paste(pil_img)

    for root, label in ((base_colors, row['color']), (base_tails, row['tail'])):
        target_dir = os.path.join(root, str(int(label)))
        # save() fails if the label folder does not exist yet.
        os.makedirs(target_dir, exist_ok=True)
        new_img.save(os.path.join(target_dir, img))

if skipped:
    print(f'{len(skipped)} file(s) skipped, no unique labelled row in dogs: {skipped}')

  0%|          | 0/954 [00:00<?, ?it/s]