In [56]:
from tqdm.notebook import tqdm
tqdm.pandas()

import requests
import zipfile
import io
import os
import shutil
import json
import csv
import random
import tarfile

import torch
import torchvision
import torchvision.transforms as transforms

from matplotlib import pyplot as plt
from PIL import Image
import cv2
import numpy as np
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [3]:
DATA_DIR= "/home/ubuntu/dataset"
IMAGES_DIR= DATA_DIR + "/images"
TRAIN_DIR= DATA_DIR + "/train"
VAL_DIR= DATA_DIR + "/val"

BASELINE_DIR = "/home/ubuntu/baseline"
BASELINE_IMAGES_DIR = BASELINE_DIR + "/images"

if not os.path.isdir(DATA_DIR): 
    os.mkdir(DATA_DIR)

## Filter to only outdoor day images

In [6]:
print("Setting up image directory")

outday_count = 0

if not os.path.isdir(IMAGES_DIR): 
    os.mkdir(IMAGES_DIR)
    
with open(DATA_DIR + "/labels-baseline-outdoor_indoor.csv") as meta_file:
    rows = csv.reader(meta_file)
    
    print("Moving outdoor day class labelled images to images directory from large image dataset")
    
    for r,row in tqdm(enumerate(rows),total=len(rows)):
        if (r == 0):
            id_index = row.index("Image")
            label_index = row.index("Label")
        elif row[label_index] == "1": 
            file = row[id_index]
            file_name = os.path.join(BASELINE_IMAGES_DIR, file)
            if os.path.exists(file_name):
                outday_count += 1
                shutil.copy(file_name, IMAGES_DIR)
                
print(f"{outday_count} images copied to dataset")
print("Finished setting up image directory")
            

Setting up image directory
Moving outdoor day class labelled images to images directory from large image dataset


621445it [09:33, 1084.24it/s]

347405 images copied to dataset
Finished setting up image directory





## Merge outdoor labelled metadata w/ geo

*Time consuming!* Photo metadata is very large and must be joined. Try to limit runs.

In [41]:
with open(DATA_DIR + "/spatial-index/Tile-Quadtree-Regions.csv") as geo_file:
    geo = [{k.strip(): float(v) for k, v in row.items()} for row in csv.DictReader(geo_file, skipinitialspace=True) if row]

labels_df = pd.read_csv(DATA_DIR + "/labels-baseline-outdoor_indoor.csv")
meta_outdoor = labels_df[labels_df["Label"] == 1]
meta_outdoor["id"] = meta_outdoor["Image"].str.replace('.jpg','').astype("int64")
# display(meta_outdoor)

  meta_outdoor["id"] = meta_outdoor["Image"].str.replace('.jpg','').astype("int64")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_outdoor["id"] = meta_outdoor["Image"].str.replace('.jpg','').astype("int64")


In [None]:
meta_df = pd.read_csv(DATA_DIR + "/photo_metadata.csv")

In [42]:
meta_all = meta_outdoor.merge(meta_df,how="inner",on="id")

Unnamed: 0.1,Unnamed: 0,Image,Label,id,userid,title,tags,latitude,longitude,views,date_taken,date_uploaded,accuracy,flickr_secret,flickr_server,flickr_farm,x,y,z
0,0,1127618657.jpg,1.0,1127618657,50166674@N00,gone fishin',{},44.993454,-73.362836,7,2003-01-01 00:03:17,2007-08-15 16:39:24,13,0c743a54c5,1411,2,1.293578e+06,-4.328964e+06,4.486834e+06
1,2,4571931357.jpg,1.0,4571931357,42362746@N03,Congo kivu violences handicapés,"{voyage,africa,war,echo,danse,chapeau,violence...",-2.506609,28.838424,261,2010-03-22 13:18:29,2010-05-02 21:15:11,12,ed7a2a76fd,4056,5,5.581832e+06,3.073515e+06,-2.770798e+05
2,3,1433580947.jpg,1.0,1433580947,80155366@N00,Sydney Opera,"{architecture,opera,sydney,australia,cbd,thero...",-33.870415,151.217880,28,2006-10-01 15:12:41,2007-09-24 18:10:29,12,95a84772e6,1053,2,-4.646346e+06,2.552463e+06,-3.534521e+06
3,8,1352211464.jpg,1.0,1352211464,11539320@N08,Living in a box 1,{},52.414042,4.873809,25,2007-09-09 14:21:06,2007-09-09 20:05:38,14,04851d0d7a,1023,2,3.884458e+06,3.312269e+05,5.031036e+06
4,10,1190450959.jpg,1.0,1190450959,11854998@N08,"Pilatus, Switzerland, 2006-05-12","{switzerland,pilatus,perfectpanoramas}",46.980135,8.253822,91,2007-08-20 23:53:53,2007-08-21 06:53:53,12,ad5e089f25,1434,2,4.314148e+06,6.258155e+05,4.640258e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281369,621428,1356734502.jpg,1.0,1356734502,10101954@N05,Farm House on Furka,"{farmhouse,alpes,landscape,switzerland,furka,a...",46.587977,8.477926,186,2007-09-10 15:49:32,2007-09-10 20:05:38,13,243d5c3234,1069,2,4.343091e+06,6.473688e+05,4.610407e+06
281370,621430,120624118.jpg,1.0,120624118,49503178415@N01,Grass,"{turkey,me2,ankara,metu,odtü,middleeasttechnic...",39.894781,32.783347,187,2006-03-29 13:17:05,2006-03-31 07:17:05,16,f1cf08f929,35,1,4.119723e+06,2.653286e+06,4.069029e+06
281371,621434,46167242.jpg,1.0,46167242,88025703@N00,Metropolitan Museum Img_0485,"{sculpture,art,statue,museum,architecture,godd...",40.774106,-73.966398,264,2005-08-18 14:55:24,2005-09-24 19:37:04,13,7df2a38da0,24,1,1.335988e+06,-4.648851e+06,4.143458e+06
281372,621436,124651571.jpg,1.0,124651571,11968973@N00,Fuji-san,"{japan,shinkansen,mtfuji}",35.363505,138.729686,44,2006-04-06 20:00:18,2006-04-07 12:26:16,12,e1eeeca864,51,1,-3.913756e+06,3.434727e+06,3.670829e+06


In [60]:
meta_all.to_csv(f"{DATA_DIR}/meta_all.csv")

In [76]:
# def get_grid(row):
#     grid = next(filter(lambda g: row['latitude'] >= g["Min Latitude"] and row['latitude'] <= g["Max Latitude"] and row['longitude'] >= g["Min Longitude"] and row['longitude'] <= g["Max Longitude"], geo))
#     return str(int(grid["Quad-Tree Region ID"]))

# meta_all["grid"] = meta_all.progress_apply(get_grid ,axis=1)
# display(meta_all)

# region_counts = {}

meta_all["region"] = ''

#Loading from file is significantly more performant than Pandas apply
with open(DATA_DIR + "/meta_all.csv") as meta_file:
    rows = csv.reader(meta_file)
    
    print("Labeling images with map regions")
    
    for r,row in tqdm(enumerate(rows),total=len(meta_all)):
        if (r == 0):
            id_index = row.index("id")
            lat_index = row.index("latitude")
            lng_index = row.index("longitude")
            
        else: 
                
            lat = float(row[lat_index])
            lng = float(row[lng_index])

            grid = next(filter(lambda g: lat >= g["Min Latitude"] and lat <= g["Max Latitude"] and lng >= g["Min Longitude"] and lng <= g["Max Longitude"], geo),None)

            if grid:
                
                meta_all.loc[meta_all['id'] == int(row[id_index]),'region'] = str(int(grid["Quad-Tree Region ID"]))

                
#                 if region not in region_counts:
#                     region_counts[region] = 0
                
#                 region_counts[region] += 1

display(meta_all)

Labeling images with map regions


  0%|          | 0/281374 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,Image,Label,id,userid,title,tags,latitude,longitude,views,date_taken,date_uploaded,accuracy,flickr_secret,flickr_server,flickr_farm,x,y,z,region
0,0,1127618657.jpg,1.0,1127618657,50166674@N00,gone fishin',{},44.993454,-73.362836,7,2003-01-01 00:03:17,2007-08-15 16:39:24,13,0c743a54c5,1411,2,1.293578e+06,-4.328964e+06,4.486834e+06,763
1,2,4571931357.jpg,1.0,4571931357,42362746@N03,Congo kivu violences handicapés,"{voyage,africa,war,echo,danse,chapeau,violence...",-2.506609,28.838424,261,2010-03-22 13:18:29,2010-05-02 21:15:11,12,ed7a2a76fd,4056,5,5.581832e+06,3.073515e+06,-2.770798e+05,8
2,3,1433580947.jpg,1.0,1433580947,80155366@N00,Sydney Opera,"{architecture,opera,sydney,australia,cbd,thero...",-33.870415,151.217880,28,2006-10-01 15:12:41,2007-09-24 18:10:29,12,95a84772e6,1053,2,-4.646346e+06,2.552463e+06,-3.534521e+06,640
3,8,1352211464.jpg,1.0,1352211464,11539320@N08,Living in a box 1,{},52.414042,4.873809,25,2007-09-09 14:21:06,2007-09-09 20:05:38,14,04851d0d7a,1023,2,3.884458e+06,3.312269e+05,5.031036e+06,1911
4,10,1190450959.jpg,1.0,1190450959,11854998@N08,"Pilatus, Switzerland, 2006-05-12","{switzerland,pilatus,perfectpanoramas}",46.980135,8.253822,91,2007-08-20 23:53:53,2007-08-21 06:53:53,12,ad5e089f25,1434,2,4.314148e+06,6.258155e+05,4.640258e+06,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281369,621428,1356734502.jpg,1.0,1356734502,10101954@N05,Farm House on Furka,"{farmhouse,alpes,landscape,switzerland,furka,a...",46.587977,8.477926,186,2007-09-10 15:49:32,2007-09-10 20:05:38,13,243d5c3234,1069,2,4.343091e+06,6.473688e+05,4.610407e+06,1241
281370,621430,120624118.jpg,1.0,120624118,49503178415@N01,Grass,"{turkey,me2,ankara,metu,odtü,middleeasttechnic...",39.894781,32.783347,187,2006-03-29 13:17:05,2006-03-31 07:17:05,16,f1cf08f929,35,1,4.119723e+06,2.653286e+06,4.069029e+06,805
281371,621434,46167242.jpg,1.0,46167242,88025703@N00,Metropolitan Museum Img_0485,"{sculpture,art,statue,museum,architecture,godd...",40.774106,-73.966398,264,2005-08-18 14:55:24,2005-09-24 19:37:04,13,7df2a38da0,24,1,1.335988e+06,-4.648851e+06,4.143458e+06,1578
281372,621436,124651571.jpg,1.0,124651571,11968973@N00,Fuji-san,"{japan,shinkansen,mtfuji}",35.363505,138.729686,44,2006-04-06 20:00:18,2006-04-07 12:26:16,12,e1eeeca864,51,1,-3.913756e+06,3.434727e+06,3.670829e+06,562


In [121]:
meta_all.to_csv(f"{DATA_DIR}/meta_all.csv")

## Filter by region

In [130]:
print("Highest region class: " + str(meta_all[meta_all['region'] != '']['region'].astype(int).max()))

top_regions = meta_all.groupby('region').count().sort_values(ascending=False,by='id')

print(top_regions['id'])

print(top_regions[top_regions['id'] >= 100]['id'])

print(top_regions[top_regions['id'] >= 100]['id'].sum())

top_regions = top_regions[top_regions['id'] >= 100]['id']
display(top_regions)

Highest region class: 3592
region
84      948
17      920
27      895
15      863
18      846
       ... 
3245      1
2630      1
2631      1
3239      1
          1
Name: id, Length: 2848, dtype: int64
region
84      948
17      920
27      895
15      863
18      846
       ... 
1376    100
617     100
797     100
541     100
650     100
Name: id, Length: 785, dtype: int64
229968


region
84      948
17      920
27      895
15      863
18      846
       ... 
1376    100
617     100
797     100
541     100
650     100
Name: id, Length: 785, dtype: int64

Index(['84', '17', '27', '15', '18', '1', '13', '4', '26', '31',
       ...
       '714', '1096', '947', '1363', '784', '1376', '617', '797', '541',
       '650'],
      dtype='object', name='region', length=785)


## Split images into train and val datasets

Run once or only if you want to re-split the dataset into train/val sets

In [133]:
print("Setting up train/val class directories")

if not os.path.isdir(TRAIN_DIR): 
    os.mkdir(TRAIN_DIR)
    os.mkdir(VAL_DIR)

random.seed(1)
train_split = .8
    
train_count = 0
val_count = 0
    
with open(DATA_DIR + "/meta_all.csv") as meta_file:
    rows = csv.reader(meta_file)
    
    print("Moving images to train/val directories")
    
    for r,row in tqdm(enumerate(rows),total=len(meta_all)):
        if (r == 0):
            id_index = row.index("id")
            region_index = row.index("region")
        else: 
            file = row[id_index] + ".jpg"
            file_name = os.path.join(IMAGES_DIR, file)
        
            if os.path.exists(file_name):
                
                split_index = random.uniform(0, 1)
                
                region = row[region_index]

                if region in top_regions.index:

                    if split_index <= train_split:
                        class_dir = TRAIN_DIR + "/" + region
                        train_count += 1
                    else:
                        class_dir = VAL_DIR + "/" + region
                        val_count += 1

                    if not os.path.isdir(class_dir):
                        os.mkdir(class_dir)
                        
                    shutil.move(file_name, class_dir)
                
print(f"Moved {train_count} images to train directories and {val_count} images to val directories ({train_count + val_count} total)")
print("Finished setting up train/val class directories")
            

Setting up train/val class directories
Moving images to train/val directories


  0%|          | 0/281374 [00:00<?, ?it/s]

Moved 183645 images to train directories and 46323 images to val directories (229968 total)
Finished setting up train/val class directories
