In [1]:
from time import time
import secrets
import flickrapi
import requests
import os
import pandas as pd
import pickle
import logging


def get_photos(image_tag):
    """Search Flickr for geotagged photos of a tag, download them and save metadata.

    Runs a single-page search (at most 100 results, sorted by relevance),
    builds a static image URL for every hit, downloads the images with
    ``download_images`` and pickles the metadata frame to
    ``data/<image_tag>/<image_tag>.pkl``.

    Parameters
    ----------
    image_tag : str
        Tag to search for; also used as the output folder and pickle name.

    Returns
    -------
    str
        ``''`` on success, or ``image_tag`` when the search returned no
        photos or any step raised an exception.
    """
    columns = ['latitude', 'longitude', 'farm', 'server', 'id', 'secret']

    # initialize api; the imported `secrets` module must expose
    # `api_key` and `api_secret`
    flickr = flickrapi.FlickrAPI(secrets.api_key, secrets.api_secret, format='parsed-json')

    try:
        # search photos based on settings
        photos = flickr.photos.search(tags=image_tag,
                                      sort='relevance',
                                      content_type=1,            #photos only
                                      extras='description,geo,url_c',
                                      has_geo=1,
                                      geo_context=2,             #outdoors
                                      per_page=100,
                                      page=1
                                      )

        records = photos['photos']['photo']
        if not records:
            # An empty result used to surface as an incidental KeyError;
            # report it explicitly with the same outcome for the caller.
            print('Could not get info for: %s. '%image_tag)
            return image_tag

        # Build the frame directly from the records: DataFrame.append was
        # removed in pandas 2.0, and nothing needs to be accumulated here.
        raw_photos = pd.DataFrame(records)[columns].reset_index(drop=True)

        # construct url from pieces
        raw_photos['url'] = ('https://farm' + raw_photos.farm.astype(str)
                             + '.staticflickr.com/' + raw_photos.server.astype(str)
                             + '/' + raw_photos.id.astype(str)
                             + '_' + raw_photos.secret.astype(str) + '.jpg')

        print('..downloading photos')
        download_images(raw_photos, image_tag)

        # save data (download_images has created data/<image_tag>/ by now;
        # the with-block closes the file, no explicit close() is required)
        print('..saving metadata')
        with open('data/%s/%s.pkl' %(image_tag, image_tag), 'wb') as f:
            pickle.dump(raw_photos, f)

    except Exception as exc:
        # Best effort per keyword: report and carry on. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt stop a long run.
        print('Could not get info for: %s. '%image_tag)
        print('..reason: %s: %s' % (type(exc).__name__, exc))
        return image_tag

    return ''


def create_folder(path):
    """Create the directory ``path``, including missing parents, if it is absent.

    Parameters
    ----------
    path : str
        Directory to create. An already existing directory is left untouched.

    Raises
    ------
    OSError
        If the directory cannot be created (for example because ``path``
        exists but is a regular file).
    """
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the same directory in between.
    os.makedirs(path, exist_ok=True)


def download_images(df, keyword):
    """Download every image listed in ``df`` into ``data/<keyword>/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``id`` and ``url`` columns; each image is written to
        ``data/<keyword>/<id>.jpg``.
    keyword : str
        Name of the sub-folder of ``data/`` that receives the images.

    Notes
    -----
    Downloading is best effort: a failure for one row is printed together
    with its reason and the loop continues with the next row.
    """
    path = ''.join(['data/', keyword])
    os.makedirs(path, exist_ok=True)

    print('...df length: %d' %len(df.index))
    print('...going through each row of dataframe')
    for idx, row in df.iterrows():
        try:
            # str() so that numeric ids do not break the path construction
            image_path = ''.join([path, '/', str(row['id']), '.jpg'])

            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(row['url'], timeout=30)
            # Without this check a 404/5xx error page would be stored as
            # if it were a valid .jpg file.
            response.raise_for_status()

            # The with-block closes the file; no explicit close() needed.
            with open(image_path, 'wb') as outfile:
                outfile.write(response.content)

        except Exception as exc:
            # `Exception` instead of a bare except: Ctrl-C still interrupts.
            print('...Error occured at idx: %d (%s)' % (idx, exc))

    print('...download completed.')

In [2]:
# Load the indoor/outdoor category list. Column names are supplied here, so
# pandas treats the first line of the file as data rather than as a header.
places = pd.read_csv('IndoorOutdoor_places205.csv', names=['key','label'])

In [3]:
# Peek at the raw keys and labels before any clean-up.
places.head()

Unnamed: 0,key,label
0,/a/abbey',2
1,/a/airport_terminal',1
2,/a/alley',2
3,/a/amphitheater',2
4,/a/amusement_park',2


In [4]:
# Retrieve all outdoor scene categories (label == 2): clean up the 'key'
# column, remove duplicates, and re-index the dataframe.
# NOTE: this cell rewrites `places` in place and is not idempotent. Reload the
# CSV before re-running it, otherwise `.str[3:]` chops already-cleaned keys.

# Drop the 3-character '/x/' prefix and keep only the text before the next '/'.
# `n` must be passed by keyword (positional use is rejected by pandas >= 2.0),
# and `.str[0]` yields a single Series; `expand=True` returns a DataFrame that
# can have more than one column and then cannot be assigned to one column.
places['key'] = places['key'].str[3:].str.split('/', n=1).str[0]
places = places[places.label == 2]
places = places.drop_duplicates(ignore_index=True)
places['key'] = places['key'].str.strip('\'')
places['key'] = places['key'].replace(to_replace='_',value=' ',regex=True)
places.head(-20)

Unnamed: 0,key,label
0,abbey,2
1,alley,2
2,amphitheater,2
3,amusement park,2
4,aqueduct,2
...,...,...
107,skyscraper,2
108,slum,2
109,snowfield,2
110,swamp,2


In [5]:
# Sanity check on the cleaned table: non-null entries per column.
places.count() #should have 132

key      132
label    132
dtype: int64

In [6]:
# Fetch photos for every category and collect the keywords that failed.
# A few indices are expected to report an error, which probably means Flickr
# has no geotagged images for those keywords; they are skipped. Should have a
# total of 130 keywords at the end.
errors = []
for idx, tag in places['key'].items():

    # Resume point: raise this threshold after a crash to skip the indices
    # that were already processed.
    if idx < 0:
        continue

    start = time()
    error = get_photos(tag)
    end = time()
    # The recorded run below shows roughly 12-23 seconds per keyword.
    print('%20s in %.2e seconds.' %(tag, end-start))

    if error != '':
        errors.append(error)

..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
               abbey in 1.23e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
               alley in 1.18e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
        amphitheater in 1.27e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
      amusement park in 1.28e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
            aqueduct in 1.29e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
                arch in 1.47e+01 seconds.
..downloading photos
...df length:

..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
         golf course in 1.58e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...Error occured at idx: 15
...Error occured at idx: 91
...download completed.
..saving metadata
              harbor in 2.03e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
         herb garden in 1.77e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
             highway in 1.68e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...Error occured at idx: 58
...download completed.
..saving metadata
            hospital in 2.14e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...Error occured at idx: 36
...do

...download completed.
..saving metadata
              runway in 1.56e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
             sandbar in 1.52e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
         schoolhouse in 1.85e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
           sea cliff in 1.53e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...Error occured at idx: 42
...download completed.
..saving metadata
                shed in 2.33e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...download completed.
..saving metadata
           shopfront in 1.50e+01 seconds.
..downloading photos
...df length: 100
...going through each row of dataframe
...dow

In [7]:
# Round-trip check: reload the metadata pickle that was saved for one keyword.
keyword = 'basilica'
with open('data/%s/%s.pkl' % (keyword, keyword), 'rb') as f:
    # Leaving the with-block closes the file, so no explicit close() call.
    test = pickle.load(f)

In [8]:
# Inspect the first rows of the reloaded metadata frame.
test.head()

Unnamed: 0,latitude,longitude,farm,server,id,secret,url
0,41.006141,28.97733,66,65535,49957630107,74482549bd,https://farm66.staticflickr.com/65535/49957630...
1,41.902261,12.453346,66,65535,49946502777,a97fc528b6,https://farm66.staticflickr.com/65535/49946502...
2,44.416493,12.201175,66,65535,49937948567,af566914f8,https://farm66.staticflickr.com/65535/49937948...
3,41.009966,28.97808,66,65535,49935911456,5b2d72ea0a,https://farm66.staticflickr.com/65535/49935911...
4,43.067085,12.625565,66,65535,49916045253,f49219b8ca,https://farm66.staticflickr.com/65535/49916045...


In [9]:
# Sanity check: open the image file that belongs to the first metadata row
# of the keyword loaded in the previous cells.
from PIL import Image

image = Image.open('data/%s/%s.jpg' % (keyword, test.loc[0, 'id']))
image.show()