In [1]:
from time import time
import config
import flickrapi
import requests
import os
import pandas as pd
import pickle
import logging


def get_photos(image_tag):

    # setup dataframe for data
    raw_photos = pd.DataFrame(columns=['latitude', 'longitude','farm','server','id','secret'])
    
    # initialize api
    flickr = flickrapi.FlickrAPI(config.api_key, config.api_secret, format='parsed-json')

    # search photos based on settings
    photos = flickr.photos.search(tags=image_tag,
                                  sort='relevance',
                                  content_type=1,            #photos only
                                  extras='description,geo,url_c',
                                  has_geo=1,
                                  geo_context=2,             #outdoors
                                  per_page=20,
                                  page=1
                                  )
       
    # append photo details: description and getags
    raw_photos = raw_photos.append(pd.DataFrame(photos['photos']['photo'])
                                   [['latitude', 'longitude','farm','server','id','secret']],
                                   ignore_index=True)
    
    # construct url from pieces
    raw_photos['url'] = 'https://farm'+ raw_photos.farm.astype(str) + '.staticflickr.com/' + raw_photos.server.astype(str) + '/'+ raw_photos.id.astype(str) + '_' + raw_photos.secret.astype(str) + '.jpg'
    download_images(raw_photos, image_tag)
    
    # save data
    with open('data/%s/%s.pkl' %(image_tag, image_tag), 'wb') as f:
        pickle.dump(raw_photos, f)

    return photos


def create_folder(path):
    if not os.path.isdir(path):
        os.makedirs(path)


def download_images(df, keyword):
    path = ''.join(['data/',keyword])
    create_folder(path)

    for _, row in df.iterrows():
        image_path = ''.join([path,'/',row.id,'.jpg'])
        response = requests.get(row.url, stream=True)

        with open(image_path, 'wb') as outfile:
            outfile.write(response.content)

    logging.info('download completed.')

In [2]:
places = pd.read_csv('IndoorOutdoor_places205.csv', names=['key','label'])

In [3]:
places.head()

Unnamed: 0,key,label
0,/a/abbey',2
1,/a/airport_terminal',1
2,/a/alley',2
3,/a/amphitheater',2
4,/a/amusement_park',2


In [4]:
# retrieve all outdoor scene categories. We clean up the 'key' column, remove duplicates, and re-index the dataframe.
places['key'] = places['key'].str[3:].str.split('/',1,expand=True)
places = places[places.label == 2]
places = places.drop_duplicates(ignore_index=True)
places['key'] = places['key'].str.strip('\'')
places['key'] = places['key'].replace(to_replace='_',value=' ',regex=True)
places.head(-20)

Unnamed: 0,key,label
0,abbey,2
1,alley,2
2,amphitheater,2
3,amusement park,2
4,aqueduct,2
...,...,...
107,skyscraper,2
108,slum,2
109,snowfield,2
110,swamp,2


In [5]:
places.count() #should have 132

key      132
label    132
dtype: int64

In [6]:
for idx, row in places.iterrows():

    # change this idx when it crashes. It will give an error for a few indices. It probably means Flickr does not have 
    # geotagged images for these keywords. We skip over those. Should have a total of 130 keywords at the end.
    if idx < 128:
        pass
    else:
        start = time()
        get_photos(row.key)
        end = time()
        print('%20s in %.2e seconds.' %(row.key, end-start)) # should vary between 3-8 seconds depending on the keyword.

         wheat field in 7.84e+00 seconds.
           wind farm in 6.13e+00 seconds.
            windmill in 3.77e+00 seconds.
                yard in 4.74e+00 seconds.


In [10]:
# we test loading the pickle file.
keyword = 'snowfield'
with open('data/%s/%s.pkl' %(keyword,keyword), 'rb') as f:
    test = pickle.load(f)
    f.close()

In [11]:
test.head()

Unnamed: 0,image,latitude,longitude,farm,server,id,secret,url
0,,43.360289,-4.856986,66,65535,49937358838,44ddcfcd0e,https://farm66.staticflickr.com/65535/49937358...
1,,42.628764,0.639996,66,65535,49925308058,28d78a2c9c,https://farm66.staticflickr.com/65535/49925308...
2,,45.861222,7.937355,66,65535,49885707777,ec0142d5f0,https://farm66.staticflickr.com/65535/49885707...
3,,46.589953,12.340908,66,65535,49878327498,0fa44cbf93,https://farm66.staticflickr.com/65535/49878327...
4,,62.319984,9.266849,66,65535,49816600896,0b3fc21452,https://farm66.staticflickr.com/65535/49816600...


In [16]:
# we test loading the image.
from PIL import Image

image = Image.open('data/%s/%s.jpg'%(keyword,test.id[0]))
image.show()

In [1]:
import os

subdirs = [x[0] for x in os.walk('data')]

In [2]:
subdirs

['data',
 'data\\abbey',
 'data\\alley',
 'data\\amphitheater',
 'data\\amusement park',
 'data\\apartment building',
 'data\\aqueduct',
 'data\\arch',
 'data\\badlands',
 'data\\bamboo forest',
 'data\\baseball field',
 'data\\basilica',
 'data\\boardwalk',
 'data\\boat deck',
 'data\\botanical garden',
 'data\\bridge',
 'data\\building facade',
 'data\\butte',
 'data\\campsite',
 'data\\canyon',
 'data\\castle',
 'data\\cathedral',
 'data\\cemetery',
 'data\\chalet',
 'data\\church',
 'data\\coast',
 'data\\construction site',
 'data\\corn field',
 'data\\cottage garden',
 'data\\courthouse',
 'data\\courtyard',
 'data\\creek',
 'data\\crevasse',
 'data\\crosswalk',
 'data\\dam',
 'data\\desert',
 'data\\dock',
 'data\\doorway',
 'data\\driveway',
 'data\\excavation',
 'data\\fairway',
 'data\\field',
 'data\\fire escape',
 'data\\fire station',
 'data\\forest path',
 'data\\forest road',
 'data\\formal garden',
 'data\\fountain',
 'data\\garbage dump',
 'data\\gas station',
 'data\\

In [6]:
classes = [text[5:] for text in subdirs]

In [8]:
classes.pop(0)

''

In [9]:
classes

['abbey',
 'alley',
 'amphitheater',
 'amusement park',
 'apartment building',
 'aqueduct',
 'arch',
 'badlands',
 'bamboo forest',
 'baseball field',
 'basilica',
 'boardwalk',
 'boat deck',
 'botanical garden',
 'bridge',
 'building facade',
 'butte',
 'campsite',
 'canyon',
 'castle',
 'cathedral',
 'cemetery',
 'chalet',
 'church',
 'coast',
 'construction site',
 'corn field',
 'cottage garden',
 'courthouse',
 'courtyard',
 'creek',
 'crevasse',
 'crosswalk',
 'dam',
 'desert',
 'dock',
 'doorway',
 'driveway',
 'excavation',
 'fairway',
 'field',
 'fire escape',
 'fire station',
 'forest path',
 'forest road',
 'formal garden',
 'fountain',
 'garbage dump',
 'gas station',
 'golf course',
 'harbor',
 'herb garden',
 'highway',
 'hospital',
 'hot spring',
 'hotel',
 'ice skating rink',
 'iceberg',
 'igloo',
 'inn',
 'islet',
 'kasbah',
 'lighthouse',
 'mansion',
 'market',
 'marsh',
 'mausoleum',
 'medina',
 'monastery',
 'motel',
 'mountain',
 'ocean',
 'office building',
 'orch

In [11]:
len(classes)

130

In [13]:
import pickle
with open('flikr_class_list.pkl', 'wb') as f:
    pickle.dump(classes, f)