# Notebook that prepares Tartu images for competition

In [36]:
# Imports
from PIL import Image
import pandas as pd
import numpy as np
import os,sys

# Image descriptions

In [3]:
# Generate image names (e.g. img0.jpg)
img_desc = pd.read_csv('./raw_data/tartulinn/picture_descriptions.csv')
img_desc['img_name'] = [f'img{i}.jpg' for i in range(img_desc.shape[0])]

# Remove the initial image names
img_desc.pop('name')

# 'img_name' as index
img_desc.set_index('img_name', inplace=True)
img_desc

Unnamed: 0_level_0,keyword1,keyword2,keyword3,keyword4,keyword5,keyword6,keyword7,keyword8,keyword9,keyword10
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
img0.jpg,plane,people,worker,snow,winter,cone,,,,
img1.jpg,plane,people,worker,snow,winter,cone,,,,
img2.jpg,plane,people,worker,snow,winter,cone,tree,forrest,,
img3.jpg,tartu airport,building,snow,winter,road,flag,,,,
img4.jpg,tartu airport,building,snow,winter,road,flag,stone,tree,,
...,...,...,...,...,...,...,...,...,...,...
img296.jpg,The Sculpture of Kissing Students,Town Hall of Tartu,flags,,,,,,,
img297.jpg,The Sculpture of Kissing Students,fountain,building,water,,,,,,
img298.jpg,The Sculpture of Kissing Students,building,water,fountain,night,,,,,
img299.jpg,The Sculpture of Kissing Students,water,fountain,building,,,,,,


In [4]:
# Lowercase all values
for c in img_desc.columns:
    img_desc[c] = img_desc[c].str.lower()

## Gather objects into a single column

In [33]:
# Highlight missing values
img_desc.fillna('missing', inplace=True)

# Gather into a dictionary
obj_dict = {}
for i, row in img_desc.iterrows():
    objects = row.to_list()
    objects = [o for o in objects if o != 'missing']
    obj_dict[i] = objects

# Into dataframe
gathered_df = pd.DataFrame({
    'id': obj_dict.keys(),
    'objects': obj_dict.values()
    },
    )

gathered_df

Unnamed: 0,id,objects
0,img0.jpg,"[plane, people, worker, snow, winter, cone]"
1,img1.jpg,"[plane, people, worker, snow, winter, cone]"
2,img2.jpg,"[plane, people, worker, snow, winter, cone, tr..."
3,img3.jpg,"[tartu airport, building, snow, winter, road, ..."
4,img4.jpg,"[tartu airport, building, snow, winter, road, ..."
...,...,...
296,img296.jpg,"[the sculpture of kissing students, town hall ..."
297,img297.jpg,"[the sculpture of kissing students, fountain, ..."
298,img298.jpg,"[the sculpture of kissing students, building, ..."
299,img299.jpg,"[the sculpture of kissing students, water, fou..."


## Get all unique objects

In [38]:
# Long format
long_desc = img_desc.melt(
    value_vars = img_desc.columns.to_list(),
    var_name = 'keyword_number',
    value_name = 'object',
    ignore_index= False        
    )
long_desc

Unnamed: 0_level_0,keyword_number,object
img_name,Unnamed: 1_level_1,Unnamed: 2_level_1
img0.jpg,keyword1,plane
img1.jpg,keyword1,plane
img2.jpg,keyword1,plane
img3.jpg,keyword1,tartu airport
img4.jpg,keyword1,tartu airport
...,...,...
img296.jpg,keyword10,
img297.jpg,keyword10,
img298.jpg,keyword10,
img299.jpg,keyword10,


In [40]:
# All unique object names
unique_objects = list(long_desc.object.unique())
unique_objects

['plane',
 'tartu airport',
 'building',
 'sign',
 'buildings',
 'person',
 'green garbage can',
 'green garbage cans',
 'stairs',
 'grey garbage can',
 'dog',
 'white dog',
 'dogs',
 'cat',
 'crosswalk',
 'tree',
 'traffic sign',
 'leaves',
 'red bus',
 'blue electric bus',
 'red busses',
 'bus card',
 'human',
 'traffic',
 'snow',
 'trees',
 'aero photography',
 'people',
 'boy',
 'performing',
 'horses',
 'goats',
 'rabbit',
 'turkey',
 'chicken',
 'flowers',
 'rings',
 'child',
 'glasses',
 'fireworks',
 'books',
 'flags',
 'plants',
 'shelf',
 'potatoes',
 'rowan',
 'grass',
 'water',
 'pool',
 'cloudy',
 'sand',
 'coffee',
 'bridge',
 'coctails',
 'tap',
 'glass ',
 'hands',
 'stop sign',
 'traffic signs',
 'road',
 'barge',
 'cars',
 'basketball hoop',
 'fence',
 'playground',
 'railroad',
 'benches',
 'bench',
 'pavement',
 'stone wall',
 'yellow leaves',
 'the sculpture of kissing students',
 'blue ribbon',
 'roof',
 'green lazer',
 'tractor',
 'cloudy sky',
 'graffiti',
 'ham

# Test and train set

In [37]:
# Select 70% of data for training
train_df = gathered_df.sample(n = int(gathered_df.shape[0] * 0.7))

# Test data & solution
test_df = gathered_df.loc[~gathered_df.index.isin(train_df.index)]

solution_df = test_df[['id','objects']]
test_df.pop('objects')

# Sample submission
result_vals = train_df.objects.to_list()
sample_submission_df = solution_df.copy(deep=True)
sample_submission_df['objects'] = np.random.choice(result_vals, solution_df.shape[0])

  sample_submission_df['objects'] = np.random.choice(result_vals, solution_df.shape[0])


In [41]:
# Check that sample objects doesn't match other objects
sample_submission_df['objects'] == solution_df['objects']

6      False
7      False
11     False
13     False
15     False
       ...  
273    False
279    False
284    False
290    False
295    False
Name: objects, Length: 91, dtype: bool

In [42]:
# https://www.kaggle.com/community-competitions-setup-guide 
# Write data
result_path = "./prepped_data/tartulinn"

train_df.to_csv(os.path.join(result_path, 'train.csv'), index=False)
test_df.to_csv(os.path.join(result_path, 'test.csv'), index=False)
solution_df.to_csv(os.path.join(result_path, 'solution.csv'), index=False)
sample_submission_df.to_csv(os.path.join(result_path, 'sample_submission.csv'), index=False)

# List all raw images

In [29]:
# List files
img_dir = "/mnt/c/Users/krist/OneDrive/WORK/2202_Andmeteaduse taibutalgute hange/Teostus/andmetalgud/Andmed/Tartulinn/images/images"
img_files = img_desc.name
img_files[0]

'TartuLennujaam_KetlinLääts (20).JPG'

# Loop through raw images and reduce the size

In [34]:
# Resize each image
# https://pillow.readthedocs.io/en/stable/handbook/tutorial.html
for i, iname in enumerate(img_files):
    raw_img_path = os.path.join(img_dir, iname)
    new_img_path = os.path.join("./prepped_data/tartulinn/images", img_desc.img_name[i])
 
    try:
        img = Image.open(raw_img_path)
        img.thumbnail((600,600))
        img.save(new_img_path)
        print(f"Converted image {iname}")
    except:
        print(f"Couldn't convert image {iname}")

Converted image TartuLennujaam_KetlinLääts (20).JPG
Converted image TartuLennujaam_KetlinLääts (21).JPG
Converted image TartuLennujaam_KetlinLääts (22).JPG
Converted image TartuLennujaam_KetlinLääts (23).JPG
Converted image TartuLennujaam_KetlinLääts (24).JPG
Converted image TartuLennujaam_KetlinLääts (25).JPG
Converted image TartuLennujaam_KetlinLääts (26).JPG
Converted image TartuLennujaam_KetlinLääts (27).JPG
Converted image TartuLennujaam_KetlinLääts (28).JPG
Converted image TartuLennujaam_KetlinLääts (29).JPG
Converted image P2255571_JuhanVoolaid.JPG
Converted image IMG_2883JuhanVoolaid.JPG
Converted image 04 16 (JuhanVoolaid) (10).JPG
Converted image 101Z_Helle.jpeg
Converted image Jaatmevedu_Rutt.jpeg
Converted image JuhanVoolaid (89).JPG
Converted image JuhanVoolaid (90).JPG
Converted image Prügikastid (Marika Kööbi).JPG
Converted image prygikastid03LilianLukka.jpg
Converted image 18.11_Tartu_-20.JPG
Converted image 18.11_Tartu_-32.JPG
Converted image 18.11_Tartu_-34.JPG
Conver