In [7]:
import os
import sys
import random
import math
import re
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd


## Utils

In [9]:
def json2df(data):
    df = pd.DataFrame()
    for index, el in enumerate(data):
        for key, val in el.items():
            df.loc[index, key] = val
    return df

In [10]:
def classid2label(class_id):
    category, *attribute = class_id.split("_")
    return category, attribute

In [11]:
pallete =  [
    'Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2',
    'Set1', 'Set2', 'Set3', 'tab10', 'tab20', 'tab20b', 'tab20c']


def make_mask_img(segment_df):
    category_num = len(counter_category)
    seg_width = segment_df.at[0, "Width"]
    seg_height = segment_df.at[0, "Height"]
    seg_img = np.full(seg_width*seg_height, category_num-1, dtype=np.uint8)
    for encoded_pixels, class_id in zip(segment_df["EncodedPixels"].values, segment_df["ClassId"].values):
        pixel_list = list(map(int, encoded_pixels.split(" ")))
        for i in range(0, len(pixel_list), 2):
            start_index = pixel_list[i] - 1
            index_len = pixel_list[i+1] - 1
            seg_img[start_index:start_index+index_len] =\
                int(int(class_id.split("_")[0]) / (category_num-1) * 255)
    seg_img = seg_img.reshape((seg_height, seg_width), order='F')
    return seg_img


def train_generator(df, batch_size):
    img_ind_num = df.groupby("ImageId")["ClassId"].count()
    index = df.index.values[0]
    trn_images = []
    seg_images = []
    for i, (img_name, ind_num) in enumerate(img_ind_num.items()):
        img = cv2.imread("../data/train/" + img_name)
        segment_df = (df.loc[index:index+ind_num-1, :]).reset_index(drop=True)
        index += ind_num
        if segment_df["ImageId"].nunique() != 1:
            raise Exception("Index Range Error")
        seg_img = make_mask_img(segment_df)
        
        # HWC -> CHW
        img = img.transpose((2, 0, 1))
        
        trn_images.append(img)
        seg_images.append(seg_img)
        if((i+1) % batch_size == 0):
            return trn_images, seg_images

## Load Data

In [14]:
data_root = "../data/iMaterialist/"

In [16]:
train_df = pd.read_csv(f"{data_root}/train.csv",sep=",")
label_json = json.load(open(f"{data_root}/label_descriptions.json"))
category_df = json2df(label_json["categories"])
attributes_df = json2df(label_json["attributes"])

In [17]:
train_df.shape

(333415, 5)

In [18]:
attributes_df.shape

(92, 4)

In [19]:
category_df.head()

Unnamed: 0,id,name,supercategory,level
0,0.0,"shirt, blouse",upperbody,2.0
1,1.0,"top, t-shirt, sweatshirt",upperbody,2.0
2,2.0,sweater,upperbody,2.0
3,3.0,cardigan,upperbody,2.0
4,4.0,jacket,upperbody,2.0


In [20]:
train_df.head()

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32


In [21]:
train_df["Category"] = train_df.ClassId.str.split("_", expand=True)[0]
train_df["Num_attr"]= 10 - train_df.ClassId.str.split("_", expand=True).loc[:,1:].isnull().astype(int).sum(axis=1)
train_df.Category = train_df.Category.astype(int)

In [22]:
train_df.head()

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,0,0
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28,28,0
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31,31,0
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0


In [12]:
no_attr = train_df.query("Num_attr == 0")

In [115]:
attrs_df

Unnamed: 0,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,attr_10
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
333410,,,,,,,,,,
333411,,,,,,,,,,
333412,,,,,,,,,,
333413,,,,,,,,,,


In [13]:
attrs_df = train_df.ClassId.str.split("_",expand=True).loc[:, 1:10]
attrs_df.columns = [f"attr_{i}" for i in attrs_df.columns]

attrs_df 

Unnamed: 0,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,attr_10
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
333410,,,,,,,,,,
333411,,,,,,,,,,
333412,,,,,,,,,,
333413,,,,,,,,,,


In [14]:
train_df_w_attrs = train_df.join(attrs_df)
train_df_w_attrs.head()

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,attr_10
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,,,,,,,,
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,0,0,,,,,,,,,,
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28,28,0,,,,,,,,,,
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31,31,0,,,,,,,,,,
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0,,,,,,,,,,


In [15]:
attr_one_col = train_df_w_attrs.melt(id_vars=['ImageId', 'EncodedPixels', 'Height', 'Width', 'ClassId', 'Category', 'Num_attr'], var_name="attr_position", value_name="attr")
attr_one_col = attr_one_col[~attr_one_col.isnull().any(axis=1)]
attr_one_col = pd.concat([no_attr, attr_one_col],ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
attr_one_col.loc[attr_one_col.isnull().all(axis=1),:]

Unnamed: 0,Category,ClassId,EncodedPixels,Height,ImageId,Num_attr,Width,attr,attr_position


In [17]:
attr_one_col.columns

Index(['Category', 'ClassId', 'EncodedPixels', 'Height', 'ImageId', 'Num_attr',
       'Width', 'attr', 'attr_position'],
      dtype='object')

In [18]:
attr_one_col = attr_one_col.loc[:,['ImageId', 'EncodedPixels', 'Height','Width', 'ClassId','Category', 'Num_attr', 'attr_position','attr']]
attr_one_col

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr,attr_position,attr
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,0,0,,
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28,28,0,,
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31,31,0,,
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0,,
...,...,...,...,...,...,...,...,...,...
393602,613da66e89ed032f7590638e7614d252.jpg,3156045 1 3160364 4 3164683 6 3169003 7 317332...,4320,2880,10_3_14_20_24_52_53_62_67_72_87,10,10,attr_10,87
393603,7b9cc73c92066a2ef3849a831f9e5ef1.jpg,267776 11 268792 34 269809 52 270826 67 271843...,1024,683,6_8_14_20_33_41_60_63_68_72_88,6,10,attr_10,88
393604,ef696a77808507c1b6e162b4f9480a0d.jpg,310127 8 311478 19 312831 28 314183 38 315535 ...,1361,907,10_3_10_12_14_20_33_60_61_69_87,10,10,attr_10,87
393605,f24374e1419a8bf66f360877602db59a.jpg,208482 10 209506 22 210530 23 211554 24 212578...,1024,683,10_5_14_20_24_52_56_62_66_70_87,10,10,attr_10,87


In [19]:
pd.get_dummies(attr_one_col.query("Num_attr == 2 and ImageId == '000e973c99dc090afd7898c93daf0dbc.jpg'"),columns=["attr"])

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr,attr_position,attr_41,attr_61
321880,000e973c99dc090afd7898c93daf0dbc.jpg,2022977 2 2027903 8 2032828 14 2037754 19 2042...,4928,3264,4_41_61,4,2,attr_1,1,0
333420,000e973c99dc090afd7898c93daf0dbc.jpg,2022977 2 2027903 8 2032828 14 2037754 19 2042...,4928,3264,4_41_61,4,2,attr_2,0,1


In [20]:
test = attr_one_col.query("ImageId == '00000663ed1ff0c4e0132b9b9ac53f6e.jpg'")
test

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr,attr_position,attr
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,0,0,,
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28,28,0,,
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31,31,0,,
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0,,
5,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,11464158 10 11469372 30 11474586 43 11479800 4...,5214,3676,32,32,0,,
6,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,5910568 31 5915775 65 5920982 69 5926189 73 59...,5214,3676,31,31,0,,
7,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6421446 292 6426657 298 6431867 305 6437078 31...,5214,3676,29,29,0,,
8,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,4,0,,


In [21]:
test.groupby("ImageId").apply(lambda x: x.merge(x, on="ImageId", suffixes=["_ref", "_paired"]))

Unnamed: 0_level_0,Unnamed: 1_level_0,ImageId,EncodedPixels_ref,Height_ref,Width_ref,ClassId_ref,Category_ref,Num_attr_ref,attr_position_ref,attr_ref,EncodedPixels_paired,Height_paired,Width_paired,ClassId_paired,Category_paired,Num_attr_paired,attr_position_paired,attr_paired
ImageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,6323163 11 6328356 32 6333549 53 6338742 75 63...,5214,3676,0,0,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,8521389 10 8526585 30 8531789 42 8537002 46 85...,5214,3676,28,28,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,12903854 2 12909064 7 12914275 10 12919485 15 ...,5214,3676,31,31,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6068157 7 6073371 20 6078584 34 6083797 48 608...,5214,3676,6,6,0,,,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,76,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,4,0,,,10837337 5 10842542 14 10847746 24 10852951 33...,5214,3676,32,32,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,77,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,4,0,,,11464158 10 11469372 30 11474586 43 11479800 4...,5214,3676,32,32,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,78,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,4,0,,,5910568 31 5915775 65 5920982 69 5926189 73 59...,5214,3676,31,31,0,,
00000663ed1ff0c4e0132b9b9ac53f6e.jpg,79,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4566382 8 4571592 25 4576803 41 4582013 58 458...,5214,3676,4,4,0,,,6421446 292 6426657 298 6431867 305 6437078 31...,5214,3676,29,29,0,,


In [22]:
attr_one_col_clean = attr_one_col.drop(columns = ["EncodedPixels", "Height", "Width", "Num_attr"])

In [23]:
all_pairs = attr_one_col_clean.groupby("ImageId").apply(lambda x: x.merge(x, on="ImageId", suffixes=["_ref", "_paired"]))
all_pairs = all_pairs.query("Category_ref != Category_paired").reset_index(drop=True)

In [24]:
all_pairs

Unnamed: 0,ImageId,ClassId_ref,Category_ref,attr_position_ref,attr_ref,ClassId_paired,Category_paired,attr_position_paired,attr_paired
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,6,,,0,0,,
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,6,,,28,28,,
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,6,,,31,31,,
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,6,,,32,32,,
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,6,,,32,32,,
...,...,...,...,...,...,...,...,...,...
3707569,ffffbf7014a9e408bfbb81a75bc70638.jpg,31,31,,,33,33,,
3707570,ffffbf7014a9e408bfbb81a75bc70638.jpg,31,31,,,10,10,,
3707571,ffffbf7014a9e408bfbb81a75bc70638.jpg,10,10,,,33,33,,
3707572,ffffbf7014a9e408bfbb81a75bc70638.jpg,10,10,,,31,31,,


In [25]:
one_hot_encoded = pd.get_dummies(all_pairs, dummy_na=True, columns = [ "Category_ref","attr_ref", "attr_paired", "Category_paired"])
one_hot_encoded

Unnamed: 0,ImageId,ClassId_ref,attr_position_ref,ClassId_paired,attr_position_paired,Category_ref_0.0,Category_ref_1.0,Category_ref_2.0,Category_ref_3.0,Category_ref_4.0,...,Category_paired_37.0,Category_paired_38.0,Category_paired_39.0,Category_paired_40.0,Category_paired_41.0,Category_paired_42.0,Category_paired_43.0,Category_paired_44.0,Category_paired_45.0,Category_paired_nan
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,0,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,28,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,31,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,32,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,32,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3707569,ffffbf7014a9e408bfbb81a75bc70638.jpg,31,,33,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3707570,ffffbf7014a9e408bfbb81a75bc70638.jpg,31,,10,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3707571,ffffbf7014a9e408bfbb81a75bc70638.jpg,10,,33,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3707572,ffffbf7014a9e408bfbb81a75bc70638.jpg,10,,31,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
one_hot_encoded.query("ImageId == '00000663ed1ff0c4e0132b9b9ac53f6e.jpg'")

Unnamed: 0,ImageId,ClassId_ref,attr_position_ref,ClassId_paired,attr_position_paired,Category_ref_0.0,Category_ref_1.0,Category_ref_2.0,Category_ref_3.0,Category_ref_4.0,...,Category_paired_37.0,Category_paired_38.0,Category_paired_39.0,Category_paired_40.0,Category_paired_41.0,Category_paired_42.0,Category_paired_43.0,Category_paired_44.0,Category_paired_45.0,Category_paired_nan
0,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,0,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,28,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,31,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,32,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,6,,32,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4,,31,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
64,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4,,32,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
65,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4,,32,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
66,00000663ed1ff0c4e0132b9b9ac53f6e.jpg,4,,31,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
one_hot_encoded.shape

(3707574, 285)

In [28]:
index_col, ref_cols, paired_cols = [["ImageID"], 
                                    [ref_col for ref_col in one_hot_encoded.columns if "ref" in ref_col], 
                                    [paired_col for paired_col in one_hot_encoded.columns if "paired" in paired_col]]


In [29]:
X = one_hot_encoded.reset_index(drop=True).drop(columns=["attr_position_ref", "ImageId", "ClassId_ref"] + paired_cols)
y = one_hot_encoded.reset_index(drop=True).drop(columns=["attr_position_paired", "ImageId", "ClassId_paired"] + ref_cols)



In [30]:
X.dtypes

Category_ref_0.0    uint8
Category_ref_1.0    uint8
Category_ref_2.0    uint8
Category_ref_3.0    uint8
Category_ref_4.0    uint8
                    ...  
attr_ref_89         uint8
attr_ref_9          uint8
attr_ref_90         uint8
attr_ref_91         uint8
attr_ref_nan        uint8
Length: 140, dtype: object

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)


In [39]:
X_test.to_csv("Rec_sys_set/X_test",sep="\t", index=False)
y_train.to_csv("Rec_sys_set/y_train",sep="\t", index=False)
y_test.to_csv("Rec_sys_set/y_test",sep="\t", index=False)

In [43]:
one_hot_encoded.query("attr_ref_nan == 0 and attr_paired_nan == 0").drop(columns=["attr_position_ref", "ImageId", "ClassId_ref", "attr_position_paired", "ClassId_paired"]).to_csv("Rec_sys_set/OHE_w_attrs_only",sep="\t",index=False)

In [None]:
X_no_nan = one_hot_encoded.reset_index(drop=True).drop(columns=["attr_position_ref", "ImageId", "ClassId_ref"] + paired_cols)
y = one_hot_encoded.reset_index(drop=True).drop(columns=["attr_position_paired", "ImageId", "ClassId_paired"] + ref_cols)


In [101]:
at_least_one_attr = attr_one_col.query("Num_attr != 0")
at_least_one_attr_dummies = pd.get_dummies(at_least_one_attr, columns=["attr"])
at_least_one_attr_dummies = at_least_one_attr_dummies.groupby(["ImageId", "ClassId", "Category"]).sum().reset_index().drop(columns= ["Height","Width","Num_attr"])
at_least_one_attr_dummies.loc[:,"attr_0":] = np.where(at_least_one_attr_dummies.loc[:,"attr_0":] >0, 1, 0)

all_pairs_atleast_one_atr = at_least_one_attr_dummies.groupby("ImageId").apply(lambda x: x.merge(x, on="ImageId", suffixes=["_ref", "_paired"]))
all_pairs_atleast_one_atr = all_pairs_atleast_one_atr.query("ClassId_ref != ClassId_paired").reset_index(drop=True)
print(all_pairs_atleast_one_atr.shape)

#sample check to see possible pairings for one image
display(all_pairs_atleast_one_atr.query("ImageId == '000e973c99dc090afd7898c93daf0dbc.jpg'")[["Category_ref", "Category_paired"]])
all_pairs_atleast_one_atr = pd.get_dummies(all_pairs_atleast_one_atr, columns = ["Category_ref", "Category_paired"])


(11952, 189)


Unnamed: 0,Category_ref,Category_paired
2,10,4
3,10,4
4,10,6
5,4,10
6,4,4
7,4,6
8,4,10
9,4,4
10,4,6
11,6,10


In [102]:
at_least_one_attr.query("attr == 2")#.loc[(at_least_one_attr["attr"] == 2),:]

Unnamed: 0,ImageId,EncodedPixels,Height,Width,ClassId,Category,Num_attr,attr_position,attr


In [103]:
at_least_one_attr_dummies.loc[(at_least_one_attr_dummies.loc[:,"attr_0":] ==2).any(axis=1),:]

Unnamed: 0,ImageId,ClassId,Category,attr_0,attr_1,attr_10,attr_11,attr_12,attr_13,attr_14,...,attr_83,attr_84,attr_85,attr_86,attr_87,attr_88,attr_89,attr_9,attr_90,attr_91


In [104]:
at_least_one_attr_dummies

Unnamed: 0,ImageId,ClassId,Category,attr_0,attr_1,attr_10,attr_11,attr_12,attr_13,attr_14,...,attr_83,attr_84,attr_85,attr_86,attr_87,attr_88,attr_89,attr_9,attr_90,attr_91
0,000aac3870ea7c59ca0333ffa5327323.jpg,10_3_20_34_56_69_91,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,000aac3870ea7c59ca0333ffa5327323.jpg,3_1_10_20_41_60_61_91,3,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,000b3a87508b0fa185fbd53ecbe2e4c6.jpg,10_5_19_33_59_61_66_88,10,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,000e973c99dc090afd7898c93daf0dbc.jpg,10_7_18_20_34_53_61,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,000e973c99dc090afd7898c93daf0dbc.jpg,4_0_20_41_60_72_88,4,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11521,fff36676cdac2dd8d057783f0e9f24cc.jpg,3_6_11_19_40_60_61_88,3,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11522,fff36676cdac2dd8d057783f0e9f24cc.jpg,8_3_20_33_60_61,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11523,fff65379ba62e3f6498a20e9cc2a99fe.jpg,1_0_20_41_49_61,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11524,fff65379ba62e3f6498a20e9cc2a99fe.jpg,6_8_15_20_40_60_61_87,6,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [107]:
all_pairs_atleast_one_atr

Unnamed: 0,ImageId,ClassId_ref,attr_0_ref,attr_1_ref,attr_10_ref,attr_11_ref,attr_12_ref,attr_13_ref,attr_14_ref,attr_15_ref,...,Category_paired_6,Category_paired_7,Category_paired_8,Category_paired_9,Category_paired_10,Category_paired_11,Category_paired_12,Category_paired_27,Category_paired_28,Category_paired_33
0,000aac3870ea7c59ca0333ffa5327323.jpg,10_3_20_34_56_69_91,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,000aac3870ea7c59ca0333ffa5327323.jpg,3_1_10_20_41_60_61_91,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,000e973c99dc090afd7898c93daf0dbc.jpg,10_7_18_20_34_53_61,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,000e973c99dc090afd7898c93daf0dbc.jpg,10_7_18_20_34_53_61,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,000e973c99dc090afd7898c93daf0dbc.jpg,10_7_18_20_34_53_61,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11947,fff65379ba62e3f6498a20e9cc2a99fe.jpg,1_0_20_41_49_61,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11948,fff65379ba62e3f6498a20e9cc2a99fe.jpg,6_8_15_20_40_60_61_87,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
11949,fff65379ba62e3f6498a20e9cc2a99fe.jpg,6_8_15_20_40_60_61_87,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
11950,fff65379ba62e3f6498a20e9cc2a99fe.jpg,9_3_10_20_41_60_61_91,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
all_pairs_atleast_one_atr.groupby("ImageId").count()

Unnamed: 0_level_0,ClassId_ref,attr_0_ref,attr_1_ref,attr_10_ref,attr_11_ref,attr_12_ref,attr_13_ref,attr_14_ref,attr_15_ref,attr_16_ref,...,Category_paired_6,Category_paired_7,Category_paired_8,Category_paired_9,Category_paired_10,Category_paired_11,Category_paired_12,Category_paired_27,Category_paired_28,Category_paired_33
ImageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000aac3870ea7c59ca0333ffa5327323.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
000e973c99dc090afd7898c93daf0dbc.jpg,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
00211c06b1fe730097dde122cd4d3f8c.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
00213526750c326a6d8cac9d44b458de.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
004add927302571a7067932d553909e3.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffdb5284c3d1b609a2212c8b86ca6c6e.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
ffe8150a195e3aaa72640d7730a811a1.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
ffec8295f37df6ea12eecbb60d2c23d4.jpg,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
fff36676cdac2dd8d057783f0e9f24cc.jpg,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6


In [111]:
all_pairs_atleast_one_atr.query("ImageId == '000e973c99dc090afd7898c93daf0dbc.jpg'")[["ClassId_ref", "ClassId_paired"]]

Unnamed: 0,ClassId_ref,ClassId_paired
2,10_7_18_20_34_53_61,4_0_20_41_60_72_88
3,10_7_18_20_34_53_61,4_41_61
4,10_7_18_20_34_53_61,6_8_20_39_60_70
5,4_0_20_41_60_72_88,10_7_18_20_34_53_61
6,4_0_20_41_60_72_88,4_41_61
7,4_0_20_41_60_72_88,6_8_20_39_60_70
8,4_41_61,10_7_18_20_34_53_61
9,4_41_61,4_0_20_41_60_72_88
10,4_41_61,6_8_20_39_60_70
11,6_8_20_39_60_70,10_7_18_20_34_53_61


In [84]:
at_least_one_attr_dummies.loc[(at_least_one_attr_dummies.loc[:,"attr_0":] ==2).any(axis=1),:]

Unnamed: 0,ImageId,ClassId,Category,attr_0,attr_1,attr_10,attr_11,attr_12,attr_13,attr_14,...,attr_83,attr_84,attr_85,attr_86,attr_87,attr_88,attr_89,attr_9,attr_90,attr_91


In [112]:
all_pairs_atleast_one_atr.to_csv("Rec_sys_set/OHE_all_pairs_atleast_one_attr", sep="\t", index = False)

In [None]:
all_pairs_atleast_one_atr

In [None]:
all_pairs_atleast_one_atr

In [None]:
X_train_no_nan, X_test_no_nan, y_train_no_nan, y_test_no_nan

In [32]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

In [33]:
clf = BinaryRelevance(
    classifier=SVC(),
    require_dense=[False, True]
)

In [None]:
clf.fit(X_train, y_train)



In [None]:
y

In [37]:
attr_one_col.attr.isnull().sum()

321875

In [38]:
attr_one_col.attr.value_counts()

20    10773
61     8719
60     6396
41     3676
88     3569
      ...  
17       10
21        9
76        9
80        7
44        3
Name: attr, Length: 92, dtype: int64