In [None]:
# Mount Google Drive inside the Colab runtime so the dataset archives stored under
# MyDrive can be read. Later cells use the relative prefix `drive/MyDrive/...`,
# which presumably relies on the working directory being `/content` — TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%time
import zipfile

# Each job pairs a zip archive on the mounted Drive with the local directory it
# is unpacked into: four training-image parts, then the annotation archive.
extraction_jobs = [
    (f'drive/MyDrive/Signate-OffroadSegmentation/data/train_images_A_{part}.zip', 'train_images')
    for part in range(4)
]
extraction_jobs.append(
    ('drive/MyDrive/Signate-OffroadSegmentation/data/train_annotations_A.zip', 'train_annotations')
)

for archive_path, target_dir in extraction_jobs:
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(target_dir)

# Copy the precision-test images from Drive into the working directory.
# NOTE(review): this source path uses `OffroadSegmentation/`, while every other
# Drive path in this notebook uses `Signate-OffroadSegmentation/` — confirm that
# this folder exists and is the intended one.
!cp -r drive/MyDrive/OffroadSegmentation/data/precision_test_images precision_test_images

# Install third-party dependencies into the runtime.
# NOTE(review): none of these are version-pinned and albumentations is installed
# from the repository head, so results may differ between runs over time.
!pip install segmentation-models-pytorch 
!pip install -U git+https://github.com/albu/albumentations --no-cache-dir
!pip install imagehash

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import glob

from torch.utils.data import DataLoader
from torch.utils.data import Dataset as BaseDataset

import torch
import numpy as np

import albumentations as albu
from tqdm import tqdm
import imagehash
from PIL import Image

## 1. Near-duplicate image detection with imagehash


In [None]:
%%time
def run(png_list):
    """Compute four perceptual hashes per image and stack them into one bit vector.

    Args:
        png_list: iterable of image file paths.

    Returns:
        A tuple ``(petids, hashes)``. ``petids`` is a list with one id per path:
        the file name truncated at its first ``.``. ``hashes`` is an array of
        shape ``(len(png_list), 256)`` holding the concatenated bits of the
        four hashes (the code reshapes 4 hashes to 256 values, i.e. 64 bits each).
    """

    funcs = [
        imagehash.average_hash,
        imagehash.phash,
        imagehash.dhash,
        imagehash.whash,
    ]

    petids = []
    hashes = []
    for path in tqdm(png_list):

        # Image.open keeps the underlying file open until the image is closed;
        # the context manager releases the handle after hashing so that a long
        # image list cannot exhaust the process's file descriptors.
        with Image.open(path) as image:
            bits = np.array([f(image).hash for f in funcs]).reshape(256)

        imageid = os.path.basename(path).split('.')[0]

        petids.append(imageid)
        hashes.append(bits)

    # reshape(-1, 256) leaves the non-empty result unchanged and yields a
    # well-formed (0, 256) array when no paths were given.
    return petids, np.array(hashes).reshape(-1, 256)

# Gather every training image and every precision-test image. Sorting fixes the
# row order; positions in `png_list` are what the similarity matrix indexes.
png_list = glob.glob('train_images/train_images_*/*.png')
png_list.extend(glob.glob('precision_test_images/*.png'))
png_list = np.sort(png_list)

petids, hashes_all = run(png_list)

# Prefer the GPU, but fall back to the CPU so the cell still runs on a runtime
# without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hashes_all = torch.Tensor(hashes_all.astype(int)).to(device)

# sims[i, j] is the fraction of the 256 hash bits on which image i and image j
# agree. It is computed one row at a time so only a single row of comparisons
# lives on the device at once.
sims = np.array([(hashes_all[i] == hashes_all).sum(dim=1).cpu().numpy()/256 for i in range(hashes_all.shape[0])])

# Pairs agreeing on more than 90% of the bits count as duplicates.
# Note: sims[i, i] is always 1.0, so `duplicates` also contains every (i, i) pair.
threshold = 0.90
duplicates = np.where(sims > threshold)

In [None]:
# visualize

# count = 5
# tmp = 0
# paths = png_list

# pairs = {}
# for i,j in zip(*duplicates):
#     if i == j:
#         continue

#     path1 = paths[i]
#     path2 = paths[j]
#     print(path1)
#     print(path2)
#     print(sims[i, j])

#     image1 = cv2.imread(path1)
#     image2 = cv2.imread(path2)

#     if image1.shape[0] > image1.shape[1] / 2:
#         fig,ax = plt.subplots(figsize=(20,20), ncols=2)
#     elif image1.shape[1] > image1.shape[0] / 2:
#         fig,ax = plt.subplots(figsize=(20,20), nrows=2)
#     else:
#         fig,ax = plt.subplots(figsize=(20,30), nrows=2)
#     ax[0].imshow(image1)
#     ax[1].imshow(image2)
#     plt.show()
    
#     tmp += 1
#     if tmp > count:
#         break

In [None]:
# find duplicate images
import networkx as nx
# Nodes are image positions and edges join each pair listed in `duplicates`;
# every connected component is then one group of (transitively) similar images.
# NOTE(review): if `duplicates` contains (i, i) self-pairs, each image with no
# partner still becomes its own single-node component — confirm this is intended.
g1 = nx.Graph()
g1.add_edges_from(tqdm(zip(*duplicates)))

duplicates_groups = [list(component) for component in nx.connected_components(g1)]

In [None]:
# make df_imagehash
df_imagehash = pd.DataFrame()
df_imagehash['png_name'] = png_list

# 999 is the sentinel for "belongs to no duplicate group"; the fold-assignment
# step treats `duplicate_id == 999` specially. Group ids are therefore numbered
# 0, 1, ..., 998, 1000, ... so that a real group can never be mistaken for the
# sentinel (previously the 1000th group received id 999 and collided with it).
NO_GROUP_ID = 999
duplicate_ids = np.full(len(png_list), NO_GROUP_ID)
for i, d_grp in enumerate(duplicates_groups):
    # d_grp holds row positions into png_list (the graph's node labels).
    duplicate_ids[d_grp] = i if i < NO_GROUP_ID else i + 1
df_imagehash['duplicate_id'] = duplicate_ids

# 'type' is the first path component: 'train_images' or 'precision_test_images'.
df_imagehash['type'] = df_imagehash['png_name'].str.split('/', expand=True)[0]
# nunique_type counts the distinct types inside a group, i.e. whether the group
# appears in only one of test/train (1) or in both (2).
df_nunique = df_imagehash.groupby('duplicate_id')['type'].nunique().reset_index().rename(columns={'type':'nunique_type'})
df_imagehash = pd.merge(df_imagehash, df_nunique, how='left', on='duplicate_id')

In [None]:
# make 5fold

# Every row starts in the sentinel fold 999; rows that match neither rule below
# (e.g. rows of groups that are not train-only) keep that value.
df_imagehash['fold'] = 999

# Group ids whose members are all training images; value_counts orders them by
# group size, so striding by 5 spreads groups of similar size across the folds.
is_train_only_group = (df_imagehash['type'] == 'train_images') & (df_imagehash['nunique_type'] == 1)
dup_vc = df_imagehash.loc[is_train_only_group, 'duplicate_id'].value_counts()

for fold_no in range(5):
    # Whole duplicate groups go to one fold so near-identical images stay together.
    fold_group_ids = dup_vc.index[fold_no::5]
    df_imagehash.loc[df_imagehash['duplicate_id'].isin(fold_group_ids), 'fold'] = fold_no

    # Rows carrying duplicate_id 999 are distributed round-robin by row index.
    # NOTE(review): 999 is read here as "no duplicate group"; if a real group can
    # also be given id 999 upstream, its members would be split by this rule — confirm.
    ungrouped_rows = (df_imagehash['duplicate_id'] == 999) & (df_imagehash.index % 5 == fold_no)
    df_imagehash.loc[ungrouped_rows, 'fold'] = fold_no

## 2. Add annotation area info

In [None]:
# add annotation area info

# RGB colour that marks each category in the annotation masks.
anno_list = {'road': [128, 64, 128], 
             'dirt road': [255, 128, 128],
             'other obstacle': [0, 0, 70]}
png_l = glob.glob('train_images/train_images_*/*.png')

# Images and annotation masks are paired purely by sorted position.
# NOTE(review): this assumes both globs yield the same number of files and that
# matching files sort into the same position — confirm against the dataset layout.
df_train = pd.DataFrame()
df_train['png_name'] = np.sort(png_l)
df_train['annotation'] = np.sort(glob.glob('train_annotations/train_annotations_*/*.png'))

# Count, per mask, the pixels that exactly match each category colour. The mask
# is read and converted once per image instead of once per category.
pixel_counts = {category: [] for category in anno_list}
for anno_path in tqdm(df_train['annotation']):
    mask = cv2.imread(anno_path)
    if mask is None:
        # cv2.imread signals an unreadable file by returning None; stop with the
        # offending path rather than an opaque error from cvtColor.
        raise OSError(f'could not read annotation mask: {anno_path}')
    mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)

    for category, rgb in anno_list.items():
        # A pixel belongs to the category only if all three channels match.
        pixel_counts[category].append(int((mask == rgb).all(axis=2).sum()))

for category, counts in pixel_counts.items():
    df_train[category] = np.asarray(counts, dtype=np.int64)


In [None]:
# Left-join the per-image annotation statistics onto the hash/fold table, so
# rows without a match in df_train are kept with missing annotation columns,
# then persist the result (including the row index) to Drive.
df = df_imagehash.merge(df_train, how='left', on='png_name')
df.to_csv('drive/MyDrive/Signate-OffroadSegmentation/data/5fold_validation.csv')

In [None]:
# Show the merged table as the cell output for a quick visual check.
df