# Collect photos from Flickr based on zip code coordinates and clean data

In [1]:
import flickrapi
import requests
import datetime
from time import time, mktime
import csv
import pandas as pd

In [2]:
# Flickr API credentials (left blank here; fill in before running).
api_key, api_secret = '', ''

# Client used for every Flickr call below; responses are indexed like
# nested dicts further down (e.g. r['photos']['total']).
flickr = flickrapi.FlickrAPI(
    api_key,
    api_secret,
    format='parsed-json',
)

In [3]:
# Load the zip codes to process.
# NOTE: despite the .txt extension this file is read as a pickle; only
# unpickle files you created yourself, since pickle can execute code.
import pickle

zip_codes_path = 'zipcode_final.txt'
with open(zip_codes_path, 'rb') as zip_file:
    zip_codes = pickle.load(zip_file)

# 1. Get lat-lng coordinates by zipcode (Google Maps API)

In [39]:
# Google Maps Geocoding API key (left blank here; fill in before running).
GOOGLE_KEY = ''
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

# Convert zip codes to coordinates.
# lats2/lngs2 stay index-aligned with zip_codes: a zip code for which the
# API returns no result gets the placeholder string 'none' in both lists.
lats2 = []
lngs2 = []
for zip_code in zip_codes:
    # Let requests build the query string so both values are URL-encoded,
    # and use a timeout so one stalled request cannot hang the whole loop.
    r = requests.get(
        GEOCODE_URL,
        params={'address': zip_code, 'key': GOOGLE_KEY},
        timeout=10,
    )
    # An error payload may carry no 'results' key; treat it like "no match".
    results = r.json().get('results', [])
    if results:
        location = results[0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    else:
        lat = 'none'
        lng = 'none'
    lats2.append(lat)
    lngs2.append(lng)

# 1.1 Use this table instead (the Google Maps API is rate-limited)

In [5]:
# Zip code -> coordinate lookup table (columns ZIP, LAT, LNG in the
# preview below).
zip_table_path = 'zip_coordinate.csv'
df_ = pd.read_csv(zip_table_path)
df_.head()

Unnamed: 0,ZIP,LAT,LNG
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.455183,-67.119887
3,606,18.158345,-66.932911
4,610,18.295366,-67.125135


In [7]:
# Convert zip codes to coordinates using the local lookup table.
#
# Zip codes that are missing from the table are removed from zip_codes too,
# so that lats, lngs and zip_codes stay index-aligned. Without this, the
# later zip(lats, lngs, zip_codes) loops would pair coordinates with the
# wrong zip code as soon as one lookup misses.
known_zips = set(df_.ZIP.values)  # built once: O(1) membership per zip code
lats = []
lngs = []
matched_zip_codes = []
for zip_code in zip_codes:
    if int(zip_code) not in known_zips:
        continue
    temp = df_[df_.ZIP == int(zip_code)]
    # Each entry is the matching LAT/LNG *Series* (one element when ZIP
    # values are unique), not a bare float; the clean-up cell further down
    # unwraps them via `.values`.
    lats.append(temp.LAT)
    lngs.append(temp.LNG)
    matched_zip_codes.append(zip_code)
zip_codes = matched_zip_codes

# 2. Get the number of art photos per year by zip code (within 1 km)

In [158]:
import datetime
from time import time, mktime
years = range(2010,2013)

# Collect, for one calendar year, the Flickr photo counts around every zip
# code: once for all photos (df_all) and once for photos matching the text
# 'art' (df_art). Change `year` and re-run to collect another year.
# for year in years:
year = 2011

# Upload-date window [Jan 1 of `year`, Jan 1 of `year + 1`] as Unix
# timestamps. NOTE: mktime() interprets the naive datetimes as local time.
start_date = datetime.datetime(year, 1, 1, 0, 0)
start_stamp = int(mktime(start_date.timetuple()))
end_date = datetime.datetime(year+1, 1, 1, 0, 0)
end_stamp = int(mktime(end_date.timetuple()))

# Accumulate plain row dicts and build each DataFrame once at the end;
# calling pd.concat on every iteration re-copies all previous rows.
rows_all = []
rows_art = []
for (lat, lng, zip_code) in zip(lats, lngs, zip_codes):
    # Search arguments shared by both queries.
    search_kwargs = dict(sort='relevance', safe_search=1,
                         lat=lat, lon=lng, radius=1,
                         min_upload_date=start_stamp,
                         max_upload_date=end_stamp)
    # all photos
    r_all = flickr.photos_search(**search_kwargs)
    # art photos
    r_art = flickr.photos_search(text=['art'], tags_mode='any',
                                 **search_kwargs)

    rows_all.append(dict(total = r_all['photos']['total'],
                         year = year,
                         zip_code = zip_code,
                         lat = lat,
                         lng = lng))
    rows_art.append(dict(total = r_art['photos']['total'],
                         year = year,
                         zip_code = zip_code,
                         lat = lat,
                         lng = lng))

df_all = pd.DataFrame(rows_all)
df_art = pd.DataFrame(rows_art)

In [159]:
# Keep this run's results under year-specific names so the collection cell
# can be re-run for another year without losing them.
df_all_2011, df_art_2011 = df_all, df_art

# 3. Concatenate and clean

In [160]:
# Stack the per-year frames into one table each, renumbering the rows.
# The 2012/2013 frames are not created in the visible cells; presumably
# they come from re-running the collection cell with a different `year`.
all_frames = [df_all_2011, df_all_2012, df_all_2013]
art_frames = [df_art_2011, df_art_2012, df_art_2013]
df_alls = pd.concat(all_frames, ignore_index=True)
df_arts = pd.concat(art_frames, ignore_index=True)

In [167]:
# Preview the combined frame. At this point the lat/lng cells still hold
# whole pandas Series objects rather than floats (see the output below).
df_alls.head()

Unnamed: 0,lat,lng,total,year,zip_code
0,"6560 39.294832 Name: LAT, dtype: float64","6560 -76.622229 Name: LNG, dtype: float64",17430,2011,21201
1,"6561 39.296526 Name: LAT, dtype: float64","6561 -76.607016 Name: LNG, dtype: float64",16609,2011,21202
2,"6563 39.30229 Name: LAT, dtype: float64","6563 -76.564482 Name: LNG, dtype: float64",139,2011,21205
3,"6564 39.338428 Name: LAT, dtype: float64","6564 -76.538877 Name: LNG, dtype: float64",2,2011,21206
4,"6565 39.324167 Name: LAT, dtype: float64","6565 -76.719484 Name: LNG, dtype: float64",7,2011,21207


In [171]:
# The lat/lng cells hold pandas Series, so pull the first element out of
# each one and store it as a plain float in new latitude/longitude columns.
# Indexing the element explicitly avoids calling float() on an array,
# which recent NumPy versions deprecate for arrays with ndim > 0.
for frame in (df_alls, df_arts):
    frame['latitude'] = frame['lat'].apply(lambda cell: float(cell.values[0]))
    frame['longitude'] = frame['lng'].apply(lambda cell: float(cell.values[0]))

In [177]:
# Remove the raw Series-valued columns now that latitude/longitude exist.
df_alls = df_alls.drop(['lat', 'lng'], axis=1)

In [183]:
# Remove the raw Series-valued columns now that latitude/longitude exist.
df_arts = df_arts.drop(['lat', 'lng'], axis=1)

In [180]:
# Preview the cleaned all-photos table (float latitude/longitude columns).
df_alls.head()

Unnamed: 0,total,year,zip_code,latitude,longitude
0,17430,2011,21201,39.294832,-76.622229
1,16609,2011,21202,39.296526,-76.607016
2,139,2011,21205,39.30229,-76.564482
3,2,2011,21206,39.338428,-76.538877
4,7,2011,21207,39.324167,-76.719484


In [184]:
# Preview the cleaned art-photos table (float latitude/longitude columns).
df_arts.head()

Unnamed: 0,total,year,zip_code,latitude,longitude
0,893,2011,21201,39.294832,-76.622229
1,931,2011,21202,39.296526,-76.607016
2,1,2011,21205,39.30229,-76.564482
3,0,2011,21206,39.338428,-76.538877
4,0,2011,21207,39.324167,-76.719484


# 4. Feature: "art" photo growth rate

In [198]:
# groupby -> elementwise computation
# Group keys are (year, zip_code) pairs; year: int, zip_code: str.
# The keys must be passed as a list: modern pandas treats a tuple as ONE
# key (a single column label) and raises KeyError.
gb = df_arts.groupby(['year', 'zip_code'])

In [217]:
# Spot check: ratio of next year's art-photo count to the reference year's
# count for one zip code.
# NOTE: `zip_code` is whatever value an earlier loop left behind.
ref_year = 2011
next_total = int(gb.get_group((ref_year + 1, zip_code))['total'].values[0])
prev_total = int(gb.get_group((ref_year, zip_code))['total'].values[0])
growth = float(next_total) / prev_total
# Function-call form prints the same text on Python 2 and also runs on 3.
print(growth)

1.06830907055


In [253]:
# Year-over-year growth rate of art-photo counts per zip code.
# For each reference year the rate compares that year with the next one:
#   * previous count 0, next count > 0  -> fixed rate of 2
#   * both counts 0                     -> 0
#   * otherwise                         -> (next - previous) / previous
# (year, zip_code) pairs missing from either year are skipped.
ref_years = [2011, 2012]
growth_rows = []
for ref_year in ref_years:
    for zip_code in zip_codes:
        try:
            prev_year = int(gb.get_group((ref_year, zip_code))['total'].values[0])
            this_year = int(gb.get_group((ref_year + 1, zip_code))['total'].values[0])
        except KeyError:
            # No data for this zip code in one of the two years.
            continue

        # Use boolean `and`, not bitwise `&`: `&` binds tighter than the
        # comparisons, which made the "growth from zero" branch unreachable.
        if prev_year == 0 and this_year > 0:
            growth = 2
        elif prev_year == 0:
            growth = 0
        else:
            growth = (this_year - prev_year + .0) / prev_year
        growth_rows.append(dict(growth = growth, year = ref_year, zip_code = zip_code))

# Build the frame once instead of concatenating inside the loop.
df_art_growth = pd.DataFrame(growth_rows)

In [254]:
# Preview the computed growth rates (one row per reference year and zip code).
df_art_growth.head()

Unnamed: 0,growth,year,zip_code
0,0.068309,2011,21201
1,-0.021482,2011,21202
2,0.0,2011,21205
3,0.0,2011,21206
4,0.0,2011,21207


In [255]:
# Save the growth-rate table as UTF-8 CSV without the row index.
df_art_growth.to_csv('art_rate.csv', index=False, encoding='utf-8')

# other tags

In [14]:
import datetime
from time import time, mktime
years = range(2011,2014)

# Count, for every year in `years` and every zip code, the Flickr photos
# matching the text 'artist' within radius=2 of the zip code's coordinates.
rows_art = []
for year in years:
    # The upload-date window must be recomputed for each year; computing it
    # once before the loop made every year query the same window.
    # NOTE: mktime() interprets the naive datetimes as local time.
    start_date = datetime.datetime(year, 1, 1, 0, 0)
    start_stamp = int(mktime(start_date.timetuple()))
    end_date = datetime.datetime(year+1, 1, 1, 0, 0)
    end_stamp = int(mktime(end_date.timetuple()))

    for (lat, lng, zip_code) in zip(lats, lngs, zip_codes):
        # photos matching 'artist'
        r_art = flickr.photos_search(text=['artist'], tags_mode='any',sort = 'relevance', safe_search=1, lat=lat, lon=lng, radius=2, min_upload_date=start_stamp, max_upload_date=end_stamp)
        rows_art.append(dict(total = r_art['photos']['total'],
                             year = year,
                             zip_code = zip_code,
                             lat = lat,
                             lng = lng))

# Build the frame once instead of concatenating inside the loop.
df_art = pd.DataFrame(rows_art)

In [11]:
# Save the 'artist' counts as UTF-8 CSV without the row index.
df_art.to_csv('flickr_hip.csv', index=False, encoding='utf-8')

In [13]:
# Display the collected table; lat/lng cells still hold pandas Series here
# (see the output below).
df_art

Unnamed: 0,lat,lng,total,year,zip_code
0,"6560 39.294832 Name: LAT, dtype: float64","6560 -76.622229 Name: LNG, dtype: float64",2,2011,21201
1,"6561 39.296526 Name: LAT, dtype: float64","6561 -76.607016 Name: LNG, dtype: float64",2,2011,21202
2,"6563 39.30229 Name: LAT, dtype: float64","6563 -76.564482 Name: LNG, dtype: float64",0,2011,21205
3,"6564 39.338428 Name: LAT, dtype: float64","6564 -76.538877 Name: LNG, dtype: float64",0,2011,21206
4,"6565 39.324167 Name: LAT, dtype: float64","6565 -76.719484 Name: LNG, dtype: float64",0,2011,21207
5,"6566 39.381174 Name: LAT, dtype: float64","6566 -76.721002 Name: LNG, dtype: float64",0,2011,21208
6,"6567 39.373191 Name: LAT, dtype: float64","6567 -76.670003 Name: LNG, dtype: float64",0,2011,21209
7,"6568 39.359156 Name: LAT, dtype: float64","6568 -76.632685 Name: LNG, dtype: float64",1,2011,21210
8,"6569 39.329817 Name: LAT, dtype: float64","6569 -76.639408 Name: LNG, dtype: float64",0,2011,21211
9,"6570 39.368561 Name: LAT, dtype: float64","6570 -76.614898 Name: LNG, dtype: float64",1,2011,21212
