# Multi label image classification

We aim to do multi-label image classification. We have a dataset of images that can have more than one label assigned to them.

## Import the dataset

In [121]:
import pandas as pd
import numpy as np
import cv2
import imghdr
import os
import pathlib
from PIL import Image
from matplotlib import pyplot as plt

Read the dataset

In [122]:
# Load the label file, drop its first row, renumber the index from 0 and
# rename Column1/Column2 to Image/Classes.
df = (
    pd.read_csv('./data/indian-images/multilabel_classification.csv')
    .pipe(lambda frame: frame.drop(frame.index[0]))
    .reset_index(drop=True)
    .rename(columns={"Column1": "Image", "Column2": "Classes"})
)

df.head()

Unnamed: 0,Image,Classes
0,image0.jpg,
1,image1.jpg,bus person
2,image2.jpg,sitar
3,image3.jpg,flutes
4,image4.jpg,bus trees


We want to see how many images without classes we have, and remove them

In [123]:
# Rows whose Image cell holds header text instead of a file name
header_like = df['Image'].isin(['Image_Name', 'Filenames'])
bad_rows = df.index[header_like]
df.drop(index=bad_rows, inplace=True)

We noticed that some rows in the CSV repeat the header; this suggests that the original CSV was built by concatenating several CSV files.

In [124]:
classes_column = df.loc[:, 'Classes']

# A row is unlabeled when its Classes cell is missing (NaN), empty, or made up
# only of whitespace. Matching just a single space would let the other forms
# through, and they would later break or pollute the label split.
is_unlabeled = classes_column.isna() | (classes_column.str.strip() == '')
empty_classes = df[is_unlabeled].index
empty_classes

Index([0], dtype='int64')

Now we drop the rows that have empty string for classes

In [125]:
# Remove the rows listed in empty_classes, then preview the result
df.drop(index=empty_classes, inplace=True)
df.head(n=5)

Unnamed: 0,Image,Classes
1,image1.jpg,bus person
2,image2.jpg,sitar
3,image3.jpg,flutes
4,image4.jpg,bus trees
5,image5.jpg,bus


We need to know what classes we have in the dataset

In [126]:
# Trim leading/trailing whitespace from every label string
stripped_classes = df['Classes'].str.strip()
df['Classes'] = stripped_classes

In [127]:
# Labels are space-separated in the CSV; switch to commas as the delimiter
df['Classes'] = df['Classes'].str.replace(' ', ',')

# Gather every distinct label across all rows
unique_classes = set()
for row_labels in df['Classes'].str.split(','):
    unique_classes.update(row_labels)

unique_classes

{'boat',
 'bus',
 'cycle',
 'desert',
 'ektara',
 'flutes',
 'harmonium',
 'motorcycle',
 'mountains',
 'person',
 'sea',
 'sitar',
 'sunset',
 'tabla',
 'trees',
 'truck'}

We see that we have 16 unique classes

In [128]:
# Number of distinct labels collected in unique_classes
class_count = len(unique_classes)
print(class_count)

16


In [129]:
# Preview the first rows; 'Classes' now holds comma-separated labels
df.head(n=5)

Unnamed: 0,Image,Classes
1,image1.jpg,"bus,person"
2,image2.jpg,sitar
3,image3.jpg,flutes
4,image4.jpg,"bus,trees"
5,image5.jpg,bus


We need to make a one hot encoding for the classes 

In [130]:
# One 0/1 indicator column per label, built from the comma-separated 'Classes' strings
df_classes = df['Classes'].str.get_dummies(sep=',')

# Put the file-name column in front of the indicator columns
df_encoded = pd.concat([df['Image'], df_classes], axis='columns')

df_encoded

Unnamed: 0,Image,boat,bus,cycle,desert,ektara,flutes,harmonium,motorcycle,mountains,person,sea,sitar,sunset,tabla,trees,truck
1,image1.jpg,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,image2.jpg,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,image3.jpg,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,image4.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,image5.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8972,image8964.img,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8973,image8965.img,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8974,image8966.img,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8975,image8967.img,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


## Image processing

Now we must work on the images, because there is a mismatch between the CSV data and the images in the folder

### Selecting only allowed image formats

In [131]:
# Directory that holds the image files
images_folder = "./data/indian-images/images"

# Format names accepted by the format check
allowed_formats = [
    'jpeg',
    'jpg',
    'bmp',
    'png',
]

def remove_images_in_unnacepted_format(images_folder, allowed_formats):
    """Delete every file in ``images_folder`` whose detected format is not allowed.

    The format is detected from the file content with ``imghdr.what``, not from
    the file extension. A file whose format is not recognised yields ``None``
    and is therefore removed as well. Removal is permanent.

    Args:
        images_folder: Path of the directory to scan (top level only).
        allowed_formats: Collection of format names to keep, compared against
            the value returned by ``imghdr.what``.

    Returns:
        None. Removed files and failures are reported with ``print``.
    """
    for image in os.listdir(images_folder):
        image_path = os.path.join(images_folder, image)
        # Sub-directories are not image files; leave them untouched
        if not os.path.isfile(image_path):
            continue
        try:
            tip = imghdr.what(image_path)
            if tip not in allowed_formats:
                print('Image not in ext list {}'.format(image_path))
                os.remove(image_path)
        except Exception as e:
            # Best-effort cleanup: keep going, but report the actual cause
            print('Issue with image {}: {}'.format(image_path, e))


# Run the cleanup on the image folder (this deletes files on disk)
remove_images_in_unnacepted_format(
    images_folder=images_folder,
    allowed_formats=allowed_formats,
)

In the images folder we only have jpeg files. But in the csv dataset we also have jpg and img files

In [132]:
# Take whatever follows the last dot of each file name as its extension
extension_per_image = df_encoded['Image'].str.rsplit('.', n=1).str[-1]
file_extensions = extension_per_image.unique()

print(file_extensions)

['jpg' 'img']


In [133]:
# Separate the rows by the extension recorded in the CSV file name
is_jpg = df_encoded['Image'].str.endswith('.jpg')
is_img = df_encoded['Image'].str.endswith('.img')
images_with_jpg_extension = df_encoded[is_jpg]
images_with_img_extension = df_encoded[is_img]

print("jpg: ", len(images_with_jpg_extension))
print("img: ", len(images_with_img_extension))

jpg:  6968
img:  2000


We drop the images with img extension

In [134]:
# Discard the '.img' rows, then look at the end of what remains
df_encoded.drop(index=images_with_img_extension.index, inplace=True)
df_encoded.tail(n=5)

Unnamed: 0,Image,boat,bus,cycle,desert,ektara,flutes,harmonium,motorcycle,mountains,person,sea,sitar,sunset,tabla,trees,truck
6971,image6964.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6972,image6965.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6973,image6966.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6974,image6967.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6975,image6968.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


We check the label distribution after we removed the rows with '.img' extension

In [135]:
# Per-label totals: sum every column after the first one ('Image')
label_columns = df_encoded.columns[1:]
df_encoded[label_columns].sum()

boat          1586
bus            667
cycle         1465
desert           2
ektara         143
flutes         203
harmonium      241
motorcycle    1157
mountains      144
person        2279
sea           1630
sitar          244
sunset         114
tabla          771
trees         1488
truck          918
dtype: int64

We notice that the desert label occurs very rarely (only 2 images), making it an outlier from a data perspective, so we can remove that column

In [136]:
# Remove the 'desert' indicator column and show the resulting frame
df_encoded.drop(columns='desert', inplace=True)
df_encoded

Unnamed: 0,Image,boat,bus,cycle,ektara,flutes,harmonium,motorcycle,mountains,person,sea,sitar,sunset,tabla,trees,truck
1,image1.jpg,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2,image2.jpg,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,image3.jpg,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,image4.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
5,image5.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6971,image6964.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6972,image6965.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6973,image6966.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
6974,image6967.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


Now we removed the desert column, so no image will have that label anymore.

## Image details processing 

We want to have columns with the image width, height, color channel

In [137]:
def get_image_metadata(df_row):
    """Read basic properties of the image file named in a DataFrame row.

    Args:
        df_row: Row (or mapping) whose 'Image' entry is a file name located
            inside the module-level ``images_folder``.

    Returns:
        Tuple ``(width, height, format, channels)`` where ``width`` and
        ``height`` come from ``image.size``, ``format`` is ``image.format``
        and ``channels`` is ``image.mode``.
    """
    file_path = os.path.join(images_folder, df_row['Image'])
    # This runs once per row; the context manager closes each file right away
    # so the handles do not pile up.
    with Image.open(file_path) as image:
        width, height = image.size
        return width, height, image.format, image.mode

# Compute the metadata tuple of every row, then transpose into one sequence per field
metadata_per_row = df_encoded.apply(get_image_metadata, axis=1)
widths, heights, formats, channel_modes = zip(*metadata_per_row)

df_encoded['Image_width'] = widths
df_encoded['Image_height'] = heights
df_encoded['Format'] = formats
df_encoded['Channels'] = channel_modes

df_encoded.head(n=5)

Unnamed: 0,Image,boat,bus,cycle,ektara,flutes,harmonium,motorcycle,mountains,person,sea,sitar,sunset,tabla,trees,truck,Image_width,Image_height,Format,Channels
1,image1.jpg,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,441,450,JPEG,RGB
2,image2.jpg,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,214,450,JPEG,RGB
3,image3.jpg,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,450,450,JPEG,RGB
4,image4.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,450,278,JPEG,RGB
5,image5.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,450,350,JPEG,RGB


We want to see which channel modes our images have

In [138]:
# Number of images per value of the 'Channels' column
channel_counts = df_encoded['Channels'].value_counts()
channel_counts

Channels
RGB    6962
L         6
Name: count, dtype: int64

We notice that 6 images have mode `L` (single-channel grayscale) rather than RGB, so we will remove them

In [139]:
# Drop the rows whose 'Channels' value is 'L', then re-check what remains
is_l_mode = df_encoded['Channels'] == 'L'
bad_channel_images_filter_index = df_encoded.index[is_l_mode]
df_encoded.drop(index=bad_channel_images_filter_index, inplace=True)

df_encoded['Channels'].value_counts()

Channels
RGB    6962
Name: count, dtype: int64

We want to see the minimum and maximum width and height of the images in the dataset

In [140]:
# Summary statistics for the two size columns
size_columns = ['Image_width', 'Image_height']
df_encoded[size_columns].describe()

Unnamed: 0,Image_width,Image_height
count,6962.0,6962.0
mean,425.042947,342.667768
std,57.739064,79.284448
min,120.0,33.0
25%,450.0,285.25
50%,450.0,337.0
75%,450.0,450.0
max,450.0,450.0


In [141]:
# Renumber the rows from 0, then list the images ordered by height (ascending)
df_encoded.reset_index(drop=True, inplace=True)
df_encoded.loc[:, ['Image', 'Image_width', 'Image_height']].sort_values("Image_height", ascending=True)

Unnamed: 0,Image,Image_width,Image_height
4097,image4103.jpg,450,33
3103,image3109.jpg,450,97
4116,image4122.jpg,450,98
722,image724.jpg,450,99
4095,image4101.jpg,450,103
...,...,...,...
2969,image2975.jpg,314,450
2970,image2976.jpg,297,450
2971,image2977.jpg,258,450
2953,image2959.jpg,398,450


In [155]:
# NOTE(review): presumably the workaround for duplicate OpenMP runtimes being
# loaded in one process; confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

def show_image(images_folder, image):
    """Open an image file from ``images_folder`` with PIL and call ``show()`` on it.

    Args:
        images_folder: Directory containing the file.
        image: File name inside ``images_folder``.
    """
    img_path = os.path.join(images_folder, image)
    # The context manager closes the file once show() returns
    with Image.open(img_path) as img:
        img.show()

# Preview one sample image
show_image(images_folder=images_folder, image='image49.jpg')

We want to filter images that are in a specific range of width and height so we don't have very small images

In [146]:
# Images whose side is at or below this many pixels are discarded
lower_size_bound = 150

# Remove the images that are not tall enough
too_short = df_encoded['Image_height'] <= lower_size_bound
small_height_images = df_encoded.index[too_short]
df_encoded.drop(index=small_height_images, inplace=True)


In [149]:
# Remove the images that are not wide enough (same threshold as for the height)
too_narrow = df_encoded['Image_width'] <= lower_size_bound
small_width_images = df_encoded.index[too_narrow]
df_encoded.drop(index=small_width_images, inplace=True)


In [151]:
# Renumber the rows from 0, then list the images ordered by width (ascending)
df_encoded.reset_index(drop=True, inplace=True)
df_encoded.loc[:, ['Image', 'Image_width', 'Image_height']].sort_values("Image_width", ascending=True)

Unnamed: 0,Image,Image_width,Image_height
4128,image4160.jpg,152,450
2735,image2751.jpg,155,450
1919,image1931.jpg,156,450
2853,image2870.jpg,156,450
4174,image4209.jpg,156,450
...,...,...,...
3234,image3254.jpg,450,335
3233,image3253.jpg,450,389
745,image753.jpg,450,423
3240,image3260.jpg,450,372


In [154]:
# Show the frame as it stands after the size filtering and index reset
df_encoded

Unnamed: 0,Image,boat,bus,cycle,ektara,flutes,harmonium,motorcycle,mountains,person,sea,sitar,sunset,tabla,trees,truck,Image_width,Image_height,Format,Channels
0,image1.jpg,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,441,450,JPEG,RGB
1,image2.jpg,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,214,450,JPEG,RGB
2,image3.jpg,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,450,450,JPEG,RGB
3,image4.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,450,278,JPEG,RGB
4,image5.jpg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,450,350,JPEG,RGB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6918,image6964.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,384,450,JPEG,RGB
6919,image6965.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,413,450,JPEG,RGB
6920,image6966.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,413,450,JPEG,RGB
6921,image6967.jpg,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,288,450,JPEG,RGB


We have the following entities in the dataset:
   - musical instruments
   - people
   - vehicles
   - landscape

Both musical instruments and vehicles can be tied to the person label, but we see that landscape labels are only tied to vehicles.
We can remove the landscape labels to simplify the dataset a bit

In [156]:
# Indicator columns describing scenery rather than objects
landscape_cols = ['mountains', 'sea', 'sunset', 'trees']

# NOTE(review): an image whose only labels were landscape ones is left with
# all-zero indicators after this drop; confirm that is acceptable.
df_encoded.drop(columns=landscape_cols, inplace=True)

df_encoded

Unnamed: 0,Image,boat,bus,cycle,ektara,flutes,harmonium,motorcycle,person,sitar,tabla,truck,Image_width,Image_height,Format,Channels
0,image1.jpg,0,1,0,0,0,0,0,1,0,0,0,441,450,JPEG,RGB
1,image2.jpg,0,0,0,0,0,0,0,0,1,0,0,214,450,JPEG,RGB
2,image3.jpg,0,0,0,0,1,0,0,0,0,0,0,450,450,JPEG,RGB
3,image4.jpg,0,1,0,0,0,0,0,0,0,0,0,450,278,JPEG,RGB
4,image5.jpg,0,1,0,0,0,0,0,0,0,0,0,450,350,JPEG,RGB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6918,image6964.jpg,0,0,0,0,0,0,0,0,0,0,1,384,450,JPEG,RGB
6919,image6965.jpg,0,0,0,0,0,0,0,0,0,0,1,413,450,JPEG,RGB
6920,image6966.jpg,0,0,0,0,0,0,0,0,0,0,1,413,450,JPEG,RGB
6921,image6967.jpg,0,0,0,0,0,0,0,0,0,0,1,288,450,JPEG,RGB


Now technically we have 2 datasets, one for musical instruments and one for vehicles.
We will split the dataset into two separate datasets: one for instruments and one for vehicles

### Musical Instruments dataset

In [158]:
vehicle_cols = ['boat', 'bus', 'cycle', 'motorcycle', 'truck']

# Rows carrying at least one vehicle label
has_vehicle = df_encoded[vehicle_cols].eq(1).any(axis=1)
rows_to_drop = df_encoded.index[has_vehicle]

# Everything without a vehicle label goes into the instruments frame.
# NOTE(review): this keeps rows that have no instrument label either
# (e.g. person-only or fully unlabeled rows); confirm that is intended.
musical_df = df_encoded.drop(index=rows_to_drop)

Unnamed: 0,Image,boat,bus,cycle,ektara,flutes,harmonium,motorcycle,person,sitar,tabla,truck,Image_width,Image_height,Format,Channels
1,image2.jpg,0,0,0,0,0,0,0,0,1,0,0,214,450,JPEG,RGB
2,image3.jpg,0,0,0,0,1,0,0,0,0,0,0,450,450,JPEG,RGB
5,image6.jpg,0,0,0,1,0,0,0,0,1,1,0,214,450,JPEG,RGB
6,image7.jpg,0,0,0,0,0,0,0,0,1,1,0,214,450,JPEG,RGB
7,image8.jpg,0,0,0,1,0,0,0,0,1,1,0,214,450,JPEG,RGB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2067,image2082.jpg,0,0,0,0,0,1,0,0,0,0,0,450,450,JPEG,RGB
2068,image2083.jpg,0,0,0,0,1,0,0,0,0,0,0,450,254,JPEG,RGB
2069,image2084.jpg,0,0,0,0,0,0,0,0,1,0,0,450,204,JPEG,RGB
2070,image2085.jpg,0,0,0,0,0,0,0,0,1,0,0,159,450,JPEG,RGB


In [160]:
# No remaining row has a vehicle label, so the vehicle indicators carry no information
musical_df.drop(columns=vehicle_cols, inplace=True)
musical_df

Unnamed: 0,Image,ektara,flutes,harmonium,person,sitar,tabla,Image_width,Image_height,Format,Channels
1,image2.jpg,0,0,0,0,1,0,214,450,JPEG,RGB
2,image3.jpg,0,1,0,0,0,0,450,450,JPEG,RGB
5,image6.jpg,1,0,0,0,1,1,214,450,JPEG,RGB
6,image7.jpg,0,0,0,0,1,1,214,450,JPEG,RGB
7,image8.jpg,1,0,0,0,1,1,214,450,JPEG,RGB
...,...,...,...,...,...,...,...,...,...,...,...
2067,image2082.jpg,0,0,1,0,0,0,450,450,JPEG,RGB
2068,image2083.jpg,0,1,0,0,0,0,450,254,JPEG,RGB
2069,image2084.jpg,0,0,0,0,1,0,450,204,JPEG,RGB
2070,image2085.jpg,0,0,0,0,1,0,159,450,JPEG,RGB


### Vehicles dataset

In [161]:
musical_cols = ['ektara', 'flutes', 'harmonium', 'sitar', 'tabla']

# Rows carrying at least one musical-instrument label
has_instrument = df_encoded[musical_cols].eq(1).any(axis=1)
rows_to_drop = df_encoded.index[has_instrument]

# Everything without an instrument label goes into the vehicles frame.
# NOTE(review): this keeps rows that have no vehicle label either
# (e.g. person-only or fully unlabeled rows); confirm that is intended.
vehicles_df = df_encoded.drop(index=rows_to_drop)

In [162]:
# No remaining row has an instrument label, so the instrument indicators carry no information
vehicles_df.drop(columns=musical_cols, inplace=True)
vehicles_df

Unnamed: 0,Image,boat,bus,cycle,motorcycle,person,truck,Image_width,Image_height,Format,Channels
0,image1.jpg,0,1,0,0,1,0,441,450,JPEG,RGB
3,image4.jpg,0,1,0,0,0,0,450,278,JPEG,RGB
4,image5.jpg,0,1,0,0,0,0,450,350,JPEG,RGB
15,image16.jpg,0,1,0,0,0,0,450,387,JPEG,RGB
26,image27.jpg,0,1,0,0,0,0,450,308,JPEG,RGB
...,...,...,...,...,...,...,...,...,...,...,...
6918,image6964.jpg,0,0,0,0,0,1,384,450,JPEG,RGB
6919,image6965.jpg,0,0,0,0,0,1,413,450,JPEG,RGB
6920,image6966.jpg,0,0,0,0,0,1,413,450,JPEG,RGB
6921,image6967.jpg,0,0,0,0,0,1,288,450,JPEG,RGB


Now we have two separate dataframes and we can make classifiers for each in a separate way

## Conclusions

The dataset was heavily corrupted, and we have brought it to a usable state where it can be fed into a model, but there are too few images to do something useful. This is just a good example of how to deal with a messed-up image dataset.