# Notebook that prepares Tartu images for competition

In [10]:
# Imports
from PIL import Image
import pandas as pd
import numpy as np
import os,sys

# Image descriptions

In [11]:
# Generate image names (e.g. img0.jpg)
img_desc = pd.read_csv('./raw_data/tartulinn/picture_descriptions.csv')
img_desc['img_name'] = [f'img{i}.jpg' for i in range(img_desc.shape[0])]

# Remove the initial image names
img_desc.pop('name')

# 'img_name' as index
img_desc.set_index('img_name', inplace=True)

# Highlight missing values
img_desc.fillna('missing', inplace=True)
img_desc

Unnamed: 0_level_0,keyword1,keyword2,keyword3,keyword4,keyword5,keyword6,keyword7,keyword8,keyword9,keyword10
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
img0.jpg,plane,people,worker,snow,winter,cone,missing,missing,missing,missing
img1.jpg,plane,people,worker,snow,winter,cone,missing,missing,missing,missing
img2.jpg,plane,people,worker,snow,winter,cone,tree,forrest,missing,missing
img3.jpg,tartu airport,building,snow,winter,road,flag,missing,missing,missing,missing
img4.jpg,tartu airport,building,snow,winter,road,flag,stone,tree,missing,missing
...,...,...,...,...,...,...,...,...,...,...
img296.jpg,The Sculpture of Kissing Students,Town Hall of Tartu,flags,missing,missing,missing,missing,missing,missing,missing
img297.jpg,The Sculpture of Kissing Students,fountain,building,water,missing,missing,missing,missing,missing,missing
img298.jpg,The Sculpture of Kissing Students,building,water,fountain,night,missing,missing,missing,missing,missing
img299.jpg,The Sculpture of Kissing Students,water,fountain,building,missing,missing,missing,missing,missing,missing


In [27]:
# Some data cleaning
for c in img_desc.columns:
    # Lowercase everything
    img_desc[c] = img_desc[c].str.lower()
    # Remove leading and trailing spaces
    img_desc[c] = img_desc[c].str.strip()
    # Replace bicylce and bicycles with bicycle
    img_desc[c] = img_desc[c].str.replace("bicylce", "bicycle")


## Long format of data

In [28]:
# Long format
long_desc = img_desc.melt(
    value_vars = img_desc.columns.to_list(),
    var_name = 'label',
    value_name = 'object',
    ignore_index= False        
    )
long_desc.pop('label')
long_desc

Unnamed: 0_level_0,object
img_name,Unnamed: 1_level_1
img0.jpg,plane
img1.jpg,plane
img2.jpg,plane
img3.jpg,tartu airport
img4.jpg,tartu airport
...,...
img296.jpg,missing
img297.jpg,missing
img298.jpg,missing
img299.jpg,missing


In [30]:
# The count of labels
labels_count = long_desc.groupby(by='object').size().sort_values(ascending=False)

# Keep only labels with at least 4 instances
labs_to_keep = labels_count[labels_count >= 4]

# Remove the label 'missing'
labs_to_keep.pop('missing')

# Add names to labels
label_dict = {f"l{i}":obj for i,obj in enumerate(labs_to_keep.keys())}
label_df = pd.DataFrame({
    'label_id': label_dict.keys(),
    'object': label_dict.values()
})
label_df

Unnamed: 0,label_id,object
0,l0,people
1,l1,trees
2,l2,grass
3,l3,building
4,l4,snow
...,...,...
87,l87,stage
88,l88,book
89,l89,books
90,l90,flag


In [31]:
# Long dataframe that includes only labels that have several instances
rows_to_keep = [l in labs_to_keep for l in long_desc.object]
keep_df = long_desc[rows_to_keep]

In [32]:
# Reset index
keep_df.reset_index(inplace=True)

# Add label IDs
keep_df = keep_df.merge(label_df, on='object')
keep_df

Unnamed: 0,img_name,object,label_id
0,img0.jpg,plane,l63
1,img1.jpg,plane,l63
2,img2.jpg,plane,l63
3,img5.jpg,plane,l63
4,img9.jpg,plane,l63
...,...,...,...
1198,img4.jpg,flag,l90
1199,img0.jpg,cone,l79
1200,img1.jpg,cone,l79
1201,img2.jpg,cone,l79


In [33]:
# Add 1s for values
keep_df = keep_df.assign(value = 1)

# Turn it into wide dataframe
keep_df_wide = keep_df.pivot_table(index='img_name', columns='label_id', values='value', fill_value=0)
keep_df_wide

label_id,l0,l1,l10,l11,l12,l13,l14,l15,l16,l17,...,l83,l84,l85,l86,l87,l88,l89,l9,l90,l91
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
img0.jpg,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
img1.jpg,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
img10.jpg,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
img100.jpg,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
img101.jpg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
img95.jpg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
img96.jpg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
img97.jpg,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
img98.jpg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Gather objects into a single column

In [34]:
# Gather into a dictionary
obj_dict = {}
for i, row in keep_df_wide.iterrows():
    labels = keep_df_wide.columns
    mystring = ""
    for elem, lab in enumerate(row):
        if lab:
            mystring += " " + str(labels[elem])
    mystring = mystring.strip()
    obj_dict[i] = mystring

# Into dataframe
gathered_df = pd.DataFrame({
    'image_id': obj_dict.keys(),
    'labels': obj_dict.values()
    },
    )

gathered_df

Unnamed: 0,image_id,labels
0,img0.jpg,l0 l17 l4 l58 l63 l79
1,img1.jpg,l0 l17 l4 l58 l63 l79
2,img10.jpg,l0 l1 l17 l4 l5 l7
3,img100.jpg,l0 l1 l24 l26 l39 l41 l5
4,img101.jpg,l0 l24 l26 l39 l41 l5
...,...,...
288,img95.jpg,l0 l87
289,img96.jpg,l0 l24 l87
290,img97.jpg,l0 l15 l24 l26 l39
291,img98.jpg,l0 l24 l26 l39


# TEST AND TRAIN SET

## id + list of labels
Test and train data represented as one row per image and the column of labels list.

In [35]:
# Select 70% of data for training
train_df = gathered_df.sample(n = int(gathered_df.shape[0] * 0.7))

# Test data & solution
test_df = gathered_df.loc[~gathered_df.index.isin(train_df.index)]

solution_df = test_df.copy(deep=True)
test_df.pop('labels')

# Sample submission
result_vals = train_df['labels'].to_list()
sample_submission_df = solution_df.copy(deep=True)
sample_submission_df['labels'] = np.random.choice(result_vals, solution_df.shape[0])

In [36]:
# Check that sample objects doesn't match other objects
sample_submission_df['labels'] == solution_df['labels']

5      False
6      False
12     False
16     False
17     False
       ...  
273    False
276    False
283    False
286    False
287    False
Name: labels, Length: 88, dtype: bool

In [37]:
# https://www.kaggle.com/community-competitions-setup-guide 
# Write data
result_path = "./prepped_data/tartulinn"

train_df.to_csv(os.path.join(result_path, 'train.csv'), index=False)
test_df.to_csv(os.path.join(result_path, 'test.csv'), index=False)
solution_df.to_csv(os.path.join(result_path, 'solution.csv'), index=False)
sample_submission_df.to_csv(os.path.join(result_path, 'sample_submission.csv'), index=False)

# Labels
label_df.to_csv(os.path.join(result_path, 'labels.csv'), index=False)