In [37]:
# imports
import pandas as pd
import numpy as np
import time
from geopandas import GeoDataFrame
from shapely.geometry import Point
from geopy.distance import great_circle, vincenty
from datetime import datetime, timedelta
from collections import Counter
from functools import reduce
import os.path
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from scipy import spatial

In [9]:
# Read metadata and clean
metadata = pd.read_csv('/mnt/metadata.csv')
metadata['tags_clean']= metadata['tags'].str.split()
metadata = metadata.replace(np.nan, '', regex=True)

In [10]:
# Create labels from tags in the metadata
def creating_labels(x):  
    if ("nature" in str(x['tags_clean'])) or ("lake" in str(x['tags_clean'])) or ("river" in str(x['tags_clean'])) or ("view" in str(x['tags_clean'])) or ("beach" in str(x['tags_clean'])) or ("flowers" in str(x['tags_clean'])) or ("landscape" in str(x['tags_clean'])) or ("waterfall" in str(x['tags_clean'])) or ("sunrise" in str(x['tags_clean'])) or ("sunset" in str(x['tags_clean'])) or ("water" in str(x['tags_clean'])) or ("nationalpark" in str(x['tags_clean'])) or ("alaska" in str(x['tags_clean'])) or ("sky" in str(x['tags_clean'])) or ("yosemite" in str(x['tags_clean'])) or ("mountains" in str(x['tags_clean'])):
        return 'Natural Landscape'
    elif ("birds" in str(x['tags_clean'])) or ("wild" in str(x['tags_clean'])) or ("wildlife" in str(x['tags_clean'])) or ("forest" in str(x['tags_clean'])) or ("animals" in str(x['tags_clean'])) or ("zoo" in str(x['tags_clean'])):
        return 'Animals & Birds'
    elif ("food" in str(x['tags_clean'])) or ("brunch" in str(x['tags_clean'])) or ("dinner" in str(x['tags_clean'])) or ("lunch" in str(x['tags_clean'])) or ("bar" in str(x['tags_clean'])) or ("restaurant" in str(x['tags_clean'])) or ("drinking" in str(x['tags_clean'])) or ("eating" in str(x['tags_clean'])):
        return 'Food'
    elif ("urban" in str(x['tags_clean'])) or ("shop" in str(x['tags_clean'])) or ("market" in str(x['tags_clean'])) or ("square" in str(x['tags_clean'])) or ("building" in str(x['tags_clean'])) or ("citylights" in str(x['tags_clean'])) or ("cars" in str(x['tags_clean'])) or ("traffic" in str(x['tags_clean'])) or ("city" in str(x['tags_clean'])) or ("downtown" in str(x['tags_clean'])) or ("sanfrancisco" in str(x['tags_clean'])) or ("newyork" in str(x['tags_clean'])) or ("newyork" in str(x['tags_clean'])) or ("seattle" in str(x['tags_clean'])) or ("sandiego" in str(x['tags_clean'])) or ("washington" in str(x['tags_clean'])):
        return 'Urban Scenes'
    elif ("hotel" in str(x['tags_clean'])) or ("home" in str(x['tags_clean'])) or ("interior" in str(x['tags_clean'])):
        return 'Interiors'
    elif ("us" in str(x['tags_clean'])) or ("people" in str(x['tags_clean'])) or ("group" in str(x['tags_clean'])) or ("friends" in str(x['tags_clean'])):
        return 'people'
    else:
        return "Others"
metadata['labels'] = metadata.apply(creating_labels, axis=1)
metadata['labels'].value_counts()
# 6500 images cannot be categorized on the basis of tags and are thus labelled as 'others'.

Natural Landscape    7372
Others               6500
Urban Scenes         3489
people               2130
Food                  997
Animals & Birds       605
Interiors             156
Name: labels, dtype: int64

In [19]:
# Spatial clusters based on the histogram
data = metadata[['latitude', 'longitude']]
db = DBSCAN(eps = 0.06, min_samples = 5, metric ='haversine', algorithm='ball_tree')
db.fit(data)
np.unique(db.labels_, return_counts=True)

(array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
         12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
         38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
         51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
         64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
         77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
         90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
        129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
        142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
        155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176

In [20]:
metadata['dblabel'] = db.labels_
dblabel_counts = metadata.groupby(['dblabel'])['image_id'].count().reset_index()

In [21]:
list_mean = metadata.groupby(['dblabel'])['latitude', 'longitude'].mean().reset_index()
list_mean = list_mean.rename(columns = {'latitude':'mean_lat', 'longitude':'mean_long'})
metadata = metadata.merge(list_mean, left_on='dblabel', right_on='dblabel')

In [22]:
# Temporal bins
def temporal_bins(x):
    if (x['hod'] > 0 and x['hod'] <= 6):
        return 'dawn'
    elif (x['hod'] > 6 and x['hod'] <= 10):
        return 'morning'
    elif (x['hod'] > 10 and x['hod'] <= 14):
        return 'noon'
    elif (x['hod'] > 14 and x['hod'] <= 18):
        return 'dusk'
    elif (x['hod'] > 18 and x['hod'] <= 23):
        return 'night'

metadata['date_taken'] = pd.to_datetime(metadata['date_taken'])
metadata['hod'] = [r.hour for r in metadata.date_taken]
metadata['hour_bins'] = metadata.apply(temporal_bins, axis=1)

In [23]:
# No. of images based on spatial clusters, temporal binning and categories identified from tags
grouped = metadata.groupby(['labels', 'dblabel', 'hour_bins']).agg({'image_id':'count', 'views':'sum'}).reset_index()
grouped = pd.merge(grouped, dblabel_counts, left_on='dblabel', right_on='dblabel')
grouped = grouped.rename(columns = {'image_id_x':'num_images', 'image_id_y':'pts_clusters'})
grouped.head()

Unnamed: 0,labels,dblabel,hour_bins,num_images,views,pts_clusters
0,Animals & Birds,-1,dawn,8,3373,6662
1,Animals & Birds,-1,dusk,40,9176,6662
2,Animals & Birds,-1,morning,61,11591,6662
3,Animals & Birds,-1,night,9,2225,6662
4,Animals & Birds,-1,noon,96,19814,6662


In [28]:
# Get nearby categorized clusters based on a location
    #input - location and time
    #find the no. of cluster based on the location and time
    #ouput images based on categories in that cluster
def get_filtered(lat, long, time):
    point1 = (lat, long)
    metadata['lat_long'] = metadata[['latitude', 'longitude']].apply(tuple, axis=1)
    metadata['mean_lat_long'] = metadata[['mean_lat', 'mean_long']].apply(tuple, axis=1)
    metadata['distances'] = [int(great_circle(point1, point).miles) for point in metadata['mean_lat_long']]
    filtered = metadata[(metadata['distances'] <= 20) & (metadata['hour_bins'] == time)] 
    grouped = filtered.groupby(['dblabel','labels']).agg({'image_id':'count', 'views':'sum'}).reset_index()
    return filtered

get_filtered(37.7845212, -122.399388, 'morning')

Unnamed: 0.1,Unnamed: 0,accuracy,date_taken,description,image_id,latitude,longitude,owner,tags,title,...,tags_clean,labels,dblabel,mean_lat,mean_long,hod,hour_bins,lat_long,mean_lat_long,distances
9390,1065,16,2017-11-28 08:39:41,"{'_content': 'Botanical Gardens, Golden Gate P...",38691188632,37.768565,-122.469706,34368269@N04,beltedkingfisher birds botanicalgardens califo...,Belted Kingfisher,...,"[beltedkingfisher, birds, botanicalgardens, ca...",Animals & Birds,30,37.769130,-122.468124,8,morning,"(37.768565, -122.469706)","(37.7691298, -122.468123867)",3
9394,1282,16,2017-06-10 10:58:12,{'_content': ''},37959432034,37.771883,-122.468220,51035555243@N01,america bayarea california doors goldengatepar...,The Summer of Love,...,"[america, bayarea, california, doors, goldenga...",Animals & Birds,30,37.769130,-122.468124,10,morning,"(37.771883, -122.46822)","(37.7691298, -122.468123867)",3
9402,7376,16,2017-06-10 10:58:12,{'_content': ''},37959432034,37.771883,-122.468220,51035555243@N01,america bayarea california doors goldengatepar...,The Summer of Love,...,"[america, bayarea, california, doors, goldenga...",Animals & Birds,30,37.769130,-122.468124,10,morning,"(37.771883, -122.46822)","(37.7691298, -122.468123867)",3
9403,19666,16,2017-11-28 08:39:41,"{'_content': 'Botanical Gardens, Golden Gate P...",38691188632,37.768565,-122.469706,34368269@N04,beltedkingfisher birds botanicalgardens califo...,Belted Kingfisher,...,"[beltedkingfisher, birds, botanicalgardens, ca...",Animals & Birds,30,37.769130,-122.468124,8,morning,"(37.768565, -122.469706)","(37.7691298, -122.468123867)",3
9415,1819,15,2013-05-24 10:08:43,{'_content': ''},38564746082,37.814038,-122.478053,159887587@N06,,Golden-Gate-Bridge-Wallpaper-San-Francisco-Cal...,...,,Others,31,37.808078,-122.474190,10,morning,"(37.814038, -122.478053)","(37.8080783077, -122.474190385)",4
9419,1014,16,2017-11-28 09:03:11,{'_content': '2017 holiday ice rink - embarcad...,38010074574,37.795179,-122.395945,30607051@N00,bayarea california color november 2017 fall bo...,tuesday night is definitely not master's night,...,"[bayarea, california, color, november, 2017, f...",Food,32,37.788340,-122.405094,9,morning,"(37.795179, -122.395945)","(37.7883395511, -122.405094091)",0
9430,1122,16,2017-05-15 10:51:06,{'_content': ''},24836581238,37.781808,-122.405962,36521981547@N01,iphone cameraphone sf sanfrancisco flickrhq or...,Both excellent,...,"[iphone, cameraphone, sf, sanfrancisco, flickr...",Urban Scenes,32,37.788340,-122.405094,10,morning,"(37.781808, -122.405962)","(37.7883395511, -122.405094091)",0
9432,1138,16,2017-04-03 09:56:25,{'_content': ''},24831252228,37.773105,-122.422531,36521981547@N01,sf sanfrancisco iphone cameraphone hayesvalley...,Smashing Pumpkins opinions 2K17,...,"[sf, sanfrancisco, iphone, cameraphone, hayesv...",Urban Scenes,32,37.788340,-122.405094,9,morning,"(37.773105, -122.422531)","(37.7883395511, -122.405094091)",0
9434,1143,16,2017-11-27 10:59:22,"{'_content': 'page street - hayes valley, san ...",37984357514,37.774244,-122.421727,30607051@N00,sanfrancisco california night dark nikon d810 ...,international high school,...,"[sanfrancisco, california, night, dark, nikon,...",Urban Scenes,32,37.788340,-122.405094,10,morning,"(37.774244, -122.421727)","(37.7883395511, -122.405094091)",0
9436,1145,16,2017-11-27 09:05:34,{'_content': 'embarcadero center - financial d...,38702116681,37.794908,-122.397158,30607051@N00,sanfrancisco california night dark nikon d810 ...,groovy style,...,"[sanfrancisco, california, night, dark, nikon,...",Food,32,37.788340,-122.405094,9,morning,"(37.794908, -122.397158)","(37.7883395511, -122.405094091)",0


In [25]:
def radius_pts(list_pts):
    list_diff = []
    difference = 0
    for point1 in list_pts:
        for point2 in list_pts:
            difference = abs(int(great_circle(point1, point2).miles))
            list_diff.append(difference)
    diameter = max(list_diff)
    return diameter

In [26]:
x = metadata.groupby(['dblabel'])['lat_long'].apply(radius_pts)

In [None]:
data = metadata[['latitude', 'longitude']]
kdtree = spatial.KDTree(data)
kdtree.data

In [None]:
pts = [40.750277, -73.987777]
kdtree.query(pts, k=5, eps=3.0, distance_upper_bound=5.0)

In [None]:
metadata.to_csv('/mnt/flask_data.csv')