# Do I need to use `cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`?
* __Reason:__
    * the 3rd place solution's code uses it:
    ```
    def preprocess_image(image_names, run_root=DATA_ROOT, out_root=OUTPUT_DIR, size=SIZE):
        for i in tqdm(range(len(image_names))):
            image_name = image_names[i]
            path = run_root+image_name
            img = cv2.imread(path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            H, W, C = img.shape
            new_H = int(SIZE)
            new_W = int(W/H*SIZE)
            img = cv2.resize(img, (new_W, new_H))
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            cv2.imwrite(OUTPUT_DIR + image_name, img)
    ```
    * Mine looks like this:
    ```
    def convert_images(filename, arch_out, file_type, out_shape=(640, 320)):
    """
    Reads an image and converts it to a desired file format
    """
    img = np.array(cv2.imread(filename))

    img = cv2.resize(img, out_shape)
    output = cv2.imencode(file_type, img)[1]
    name = f"{Path(filename).stem}{file_type}"
    arch_out.writestr(name, output)
    ```

The experiment here is to test the tangible differences between my pipeline and the 3rd place one. Both are:
* resized to (384, 576) ((576, 384) for cv2)
* saved as .jpg files in `dset_dir/output`

In [4]:
import os
import cv2

dset_dir = r"C:\Users\jchen\Desktop\Datasets\Understanding Clouds"
ex_img_fname = "test_image.jpg"

def preprocess_3rd_place(img_name, in_dir, out_dir, resize_size=(576, 384)):
    img = cv2.imread(os.path.join(in_dir, img_name))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, resize_size)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    return img
    
def preprocess_mine(img_name, in_dir, out_dir, resize_size=(576, 384)):
    img = cv2.imread(os.path.join(in_dir, img_name))
    img = cv2.resize(img, resize_size)
    return img

fn_kwargs = {
    "img_name": ex_img_fname,
    "in_dir": dset_dir,
    "out_dir": os.path.join(dset_dir, "output"),
    "resize_size": (576, 384),
}

In [6]:
import numpy as np
img_3rd = np.array(preprocess_3rd_place(**fn_kwargs))
img_mine = np.array(preprocess_mine(**fn_kwargs))
img_3rd.shape, img_mine.shape

((384, 576, 3), (384, 576, 3))

In [7]:
np.array_equal(img_3rd, img_mine)

True

There is no difference. Maybe resolution is a reason for the difference in performance?

# Testing Mask Creation (3rd Place v. Mine)

In [1]:
import os
from os.path import join
root_dset_dir = r"C:\Users\Joseph\kaggle_challenges\Understanding Clouds"
partial_dset, train_csv_path = join(root_dset_dir, "partial_dataset"), join(root_dset_dir, "train.csv")
sample_sub_path = join(root_dset_dir, "sample_submission.csv")
# importing clouds locally
repos_path = r"C:\Users\Joseph\kaggle_challenges\reproducing-cloud-3rd-place"
os.chdir(repos_path)
import clouds
os.chdir(r"C:\Users\Joseph")

In [2]:
from clouds.preprocess import Preprocessor
from clouds.experiments import setup_train_and_sub_df

size_str = "576_384"
config = {
    "paths_params": {
        "train_csv_path": train_csv_path,
        "sample_sub_csv_path": sample_sub_path,
        "train_dir": join(partial_dset, "train_images"),
        "test_dir": join(partial_dset, "test_images"),
        "train_out": join(partial_dset, f"train{size_str}.zip"),
        "test_out": join(partial_dset, f"test{size_str}.zip"),
        "mask_out": join(partial_dset, f"mask{size_str}.zip"),
    },
    "file_type": ".jpg",
    "out_shape_cv2": (576, 384),
}

def main(config):
    paths_params = config["paths_params"]
    paths_dict = {
        "train_dir": paths_params["train_dir"],
        "test_dir": paths_params["test_dir"],
        "train_out": paths_params["train_out"],
        "test_out": paths_params["test_out"],
        "mask_out": paths_params["mask_out"],
    }
    train, sub, _ = setup_train_and_sub_df(paths_params["train_csv_path"],
                                           paths_params["sample_sub_csv_path"])
    preprocessor = Preprocessor(train, paths_dict, tuple(config["out_shape_cv2"]),
                                config["file_type"])
    preprocessor.execute_masks()
    
main(config)


2 training images
2 test images


100%|██████████████████████████████████████| 5546/5546 [12:14<00:00,  6.03it/s]


In [None]:
import pandas as pd

df = pd.read_csv(train_csv_path)
df["im_id"] = df["Image_Label"].apply(lambda x: x.split("_")[0])
df.head()

In [8]:
# 3rd place solution
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm
from glob import glob

SIZE = 384

df_train = pd.read_csv(config["paths_params"]["train_csv_path"])
df_test = pd.read_csv(config["paths_params"]["sample_sub_csv_path"])
DATA_ROOT = config["paths_params"]["train_dir"]
OUTPUT_DIR = join(partial_dset, "train576_384_3rd_place")

image_names = os.listdir(DATA_ROOT)
# image_names = df_train['Image_Label'].apply(lambda x: x.split('_')[0]).unique().tolist()
# image_names += df_test['Image_Label'].apply(lambda x: x.split('_')[0]).unique().tolist()

print(f"Preprocessing {len(image_names)} training images.")

def preprocess_masks(image_names, df, run_root=DATA_ROOT, out_root=OUTPUT_DIR, size=SIZE):
    """
    Converts rles to masks and saves them as numpy arrays as `image_name` in `image_names`
    """
    pass

def resize_mask(df, size=SIZE):
    H = size
    W = int(3/2*H)
    df.fillna('', inplace=True)
    for i in tqdm(range(df.shape[0])):
        rle = df['EncodedPixels'].values[i]
        if rle != '':
            mask = rle2mask(rle, height=1400, width=2100, fill_value=1)
            mask = (cv2.resize(mask, (W, H)) > 0).astype(int)
            new_rle = mask2rle(mask)
        else:
            new_rle = rle
        df['EncodedPixels'].iloc[i] = new_rle
    df.to_csv(join(DATA_ROOT, f'train_{SIZE}.csv'), index=None)

def mask2rle(mask):  # 1:53
    '''
    img: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = mask.T.flatten()
    if pixels.sum() == 0:
        rle = ''
    else:
        pixels = np.concatenate([[0], pixels, [0]])
        runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
        runs[1::2] -= runs[::2]
        rle = ' '.join(str(x) for x in runs)
    return rle

def rle2mask(rle, height=256, width=1600, fill_value=1):
    mask = np.zeros((height, width), np.float32)
    if rle != '':
        mask = mask.reshape(-1)
        r = [int(r) for r in rle.split(' ')]
        r = np.array(r).reshape(-1, 2)
        for start, length in r:
            start = start - 1  # ???? 0 or 1 index ???
            mask[start:(start + length)] = fill_value
        mask = mask.reshape(width, height).T
    return mask

resize_mask(df_train)

Preprocessing 2 training images.


100%|████████████████████████████████████| 22184/22184 [13:52<00:00, 26.64it/s]


In [107]:
def create_mask_3rd_place(df, image_name, shape):
    item = df_3rd.loc[df_3rd["Image_Label"] == image_name]
    index = item.index[0]
    h, w = shape[0], shape[1]
    mask = np.zeros((h, w, 4), dtype=np.float32)
    labels = item["Labels"][index]#.to_string(index=False)[1:]
    if len(labels) == 1:
        label = int(labels)
        rle = item['EncodedPixels'][index]
        mask[:, :, label] =  rle2mask(rle, h, w)
    else:
        labels = [int(x) for x in labels.split(' ')]
        rles = item['EncodedPixels'][index].split('|')
        for label, rle in zip(labels, rles):
            mask[:, :, label] = rle2mask(rle, h, w)
    return mask

def make_mask_resized_dset(df: pd.DataFrame, image_name: str="img.jpg",
                           masks_dir: str="./masks",
                           shape: tuple=(320, 640)):
    """
    Create mask based on df, image name and shape.
    """
    masks = np.zeros((shape[0], shape[1], 4), dtype=np.float32)
    df = df[df["im_id"] == image_name]
    for idx, im_name in enumerate(df["im_id"].values):
        for classidx, classid in enumerate(["Fish", "Flower", "Gravel", "Sugar"]):
            mask = cv2.imread(os.path.join(masks_dir, f"{classid}{im_name}"),
                              cv2.IMREAD_GRAYSCALE)
            if mask is None:
                continue
            # if mask[:,:,0].shape != (350,525):
            #     mask = cv2.resize(mask, (525,350))
            masks[:, :, classidx] = mask
    masks = masks/255
    return masks

In [111]:
# reading the images and seeing if they are the same
## EXAMPLE 1
import cv2
import numpy as np
import pandas as pd

mine_mask_dir = join(partial_dset, f"mask{size_str}")
img_name = image_names[0]
print(f"Image name: {img_name}")

df_train = pd.read_csv(config["paths_params"]["train_csv_path"])
# 3rd place dfs
file_dir = r"C:\Users\Joseph\Downloaded_Code\kaggle-cloud-organization-master\kaggle-cloud-organization-master\files"
fivefold = pd.read_csv(join(file_dir, "5-folds_384.csv"))
# fivefold["Labels"] = fivefold["Labels"].apply(str)
print(len(fivefold))

df_list = []
for img_name in image_names:
    df_list.append(fivefold.loc[fivefold["Image_Label"] == img_name])
df_3rd = pd.concat(df_list)
df_3rd.head()

Image name: 00a0954.jpg
5546


Unnamed: 0,Image_Label,Labels,Is_defect,EncodedPixels,fold
13,00a0954.jpg,2 3,1,62221 92 62605 92 62989 92 63373 92 63757 92 6...,2
14,00b81e1.jpg,1 2 3,1,121224 99 121608 99 121992 99 122376 99 122760...,1


In [112]:
df_train["im_id"] = df_train["Image_Label"].apply(lambda x: x.split("_")[0])

mine_mask = make_mask_resized_dset(df_train, img_name, mine_mask_dir, shape=(384, 576))
mask_3rd = create_mask_3rd_place(df_3rd, img_name, shape=(384, 576))

mine_mask.shape, mask_3rd.shape

((384, 576, 4), (384, 576, 4))

In [113]:
np.unique(mine_mask, return_counts=True), np.unique(mask_3rd, return_counts=True)

((array([0.        , 0.00392157, 0.00784314, 0.01176471, 0.01568628,
         0.01960784, 0.02352941, 0.02745098, 0.972549  , 0.9764706 ,
         0.98039216, 0.9843137 , 0.9882353 , 0.99215686, 0.99607843,
         1.        ], dtype=float32),
  array([653587,   1294,    450,    228,     95,     30,      4,      3,
              1,      6,     27,     76,    177,    379,   1601, 226778],
        dtype=int64)),
 (array([0., 1.], dtype=float32), array([654631, 230105], dtype=int64)))

In [116]:
np.unique((mine_mask > 0.9).astype(int), return_counts=True), np.unique(mask_3rd, return_counts=True)

((array([0, 1]), array([655691, 229045], dtype=int64)),
 (array([0., 1.], dtype=float32), array([654631, 230105], dtype=int64)))

## Question: If I write a binary array to a jpg and load it (+ threshold), will it yield the same array?
* With jpg? `No.`
* With png? `Yes.`

In [135]:
import cv2
import numpy as np

fname = "example_temp.jpg"
array = np.random.choice([0, 1], size=(1400, 2100))
array_resized = cv2.resize(array, (576, 384), 
                           interpolation=cv2.INTER_NEAREST)
print(f"Original Distribution: {np.unique(array_resized, return_counts=True)}")
cv2.imwrite(fname, array_resized)

read_array = (np.array(cv2.imread(fname, cv2.IMREAD_GRAYSCALE)) > 0.05).astype(int)
print(f"After reading the array: {np.unique(read_array, return_counts=True)}")
os.remove(fname)
np.array_equal(array_resized, read_array)

Original Distribution: (array([0, 1], dtype=int32), array([110457, 110727], dtype=int64))
After reading the array: (array([0, 1]), array([110831, 110353], dtype=int64))


False

# Comparing 3rd Place Pipeline with Mine (Images)

In [3]:
import os
from os.path import join
root_dset_dir = r"C:\Users\Joseph\kaggle_challenges\Understanding Clouds"
partial_dset, train_csv_path = join(root_dset_dir, "partial_dataset"), join(root_dset_dir, "train.csv")
sample_sub_path = join(root_dset_dir, "sample_submission.csv")
# importing clouds locally
repos_path = r"C:\Users\Joseph\kaggle_challenges\reproducing-cloud-3rd-place"
os.chdir(repos_path)
import clouds
os.chdir(r"C:\Users\Joseph")

In [4]:
from clouds.preprocess import Preprocessor
from clouds.experiments import setup_train_and_sub_df

size_str = "576_384"
config = {
    "paths_params": {
        "train_csv_path": train_csv_path,
        "sample_sub_csv_path": sample_sub_path,
        "train_dir": join(partial_dset, "train_images"),
        "test_dir": join(partial_dset, "test_images"),
        "train_out": join(partial_dset, f"train{size_str}.zip"),
        "test_out": join(partial_dset, f"test{size_str}.zip"),
        "mask_out": join(partial_dset, f"mask{size_str}.zip"),
    },
    "file_type": ".jpg",
    "out_shape_cv2": (576, 384),
}

def main(config):
    paths_params = config["paths_params"]
    paths_dict = {
        "train_dir": paths_params["train_dir"],
        "test_dir": paths_params["test_dir"],
        "train_out": paths_params["train_out"],
        "test_out": paths_params["test_out"],
        "mask_out": paths_params["mask_out"],
    }
    train, sub, _ = setup_train_and_sub_df(paths_params["train_csv_path"],
                                           paths_params["sample_sub_csv_path"])
    preprocessor = Preprocessor(train, paths_dict, tuple(config["out_shape_cv2"]),
                                config["file_type"])
    preprocessor.execute_train_test()
    
main(config)


2 training images
2 test images


100%|████████████████████████████████████████████| 2/2 [00:00<00:00,  3.71it/s]
100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 10.26it/s]


In [16]:
# 3rd place solution
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm
from glob import glob

SIZE = 384

df_train = pd.read_csv(config["paths_params"]["train_csv_path"])
df_test = pd.read_csv(config["paths_params"]["sample_sub_csv_path"])
DATA_ROOT = config["paths_params"]["train_dir"]
OUTPUT_DIR = join(partial_dset, "train576_384_3rd_place")

image_names = os.listdir(DATA_ROOT)
# image_names = df_train['Image_Label'].apply(lambda x: x.split('_')[0]).unique().tolist()
# image_names += df_test['Image_Label'].apply(lambda x: x.split('_')[0]).unique().tolist()

print(f"Preprocessing {len(image_names)} training images.")

def preprocess_image(image_names, run_root=DATA_ROOT, out_root=OUTPUT_DIR, size=SIZE):
    for i in tqdm(range(len(image_names))):
        image_name = image_names[i]
        img = cv2.imread(join(run_root, image_name))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        H, W, C = img.shape
        new_H = int(SIZE)
        new_W = int(W/H*SIZE)
        img = cv2.resize(img, (new_W, new_H))
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        out_path = join(out_root, image_name)
        cv2.imwrite(out_path, img)

preprocess_image(image_names)

Preprocessing 2 training images.


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 10.20it/s]


In [17]:
# reading the images and seeing if they are the same
## EXAMPLE 1
import cv2
import numpy as np

img_name = image_names[0]
print(f"Image name: {img_name}")
mine_img = np.array(cv2.imread(join(partial_dset, "train576_384", img_name)))
img_3rd = np.array(cv2.imread(join(partial_dset, "train576_384_3rd_place", img_name)))

mine_img.shape, img_3rd.shape

Image name: 00a0954.jpg


((384, 576, 3), (384, 576, 3))

In [18]:
np.array_equal(mine_img, img_3rd)

True